def CreateFolders(self):
     '''
     Creates each working folder (download, cleaned, packed) if it does not exist;
     if a folder already exists, its contents are emptied
     '''
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       self.job["downloadPath"])
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       "/cleaned/")
     FileUtilities.EmptyFolderContents(self.localTempDirectory + "/packed/")
     for fp in self.job["foxpro_files"]:
         FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                           "/packed/" + fp["Name"] + "/")
 def CreateFolders(self):
     '''
     Empties the raw, ident, transactions, contracts, and indexPub working folders
     (creating any that do not already exist)
     '''
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       self.job["folderPath"]["raw"])
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       self.job["folderPath"]["ident"])
     FileUtilities.EmptyFolderContents(
         self.localTempDirectory + self.job["folderPath"]["transactions"])
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       self.job["folderPath"]["contracts"])
     FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                       self.job["folderPath"]["indexPub"])
Example #3
 def MoveToS3(self, localFolderName, folderName, subFolder):
     '''
     move gzip files to s3 and clean local instance
     localFolderName --> local folder name
     folderName --> folder name on s3
     subFolder --> date
     '''
     try:
         self.logger.debug(self.moduleName + " -- " + "MoveToS3 " +
                           localFolderName + " starting ")
         ###
         #  move any gzip files to the s3 server
         ###
         s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"] +\
                     "/" + folderName + '/' + subFolder
         localFolder = self.fileUtilities.gzipFolder + localFolderName
         S3Utilities.SyncFolderAWSCli(localFolder,
                                      s3folder,
                                      args='''--quiet --include "*.gz"''',
                                      dbug="Y")
         # Cleanup local files
         FileUtilities.EmptyFolderContents(localFolder)
         self.logger.debug(self.moduleName + " -- " + "MoveToS3 " +
                           localFolderName + " finished ")
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in MoveToS3")
         raise
Example #4
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process each file
        '''
        # Load the data from the S3 data lake into Redshift using Athena/Redshift Spectrum
        s3Key = dbCommon["s3SrcDirectory"] + "/" + catalog["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        FileUtilities.EmptyFolderContents(
            self.fileUtilities.gzipFolder
        )  # Clear the folder from the previous run
        FileUtilities.EmptyFolderContents(
            self.fileUtilities.csvFolder
        )  # Clear the folder from the previous run
        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.S3Copy(s3Key, localGzipFilepath)

        localExcelFilepath = self.fileUtilities.csvFolder + "/" + fileName
        # Remove the gz extension
        localExcelFilepath = re.sub(r'\.gz$', '', localExcelFilepath)
        self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

        # Don't have a raw excel reader for Spark so use Pandas
        self.logger.info(self.moduleName + " - Processing Excel file: " +
                         localExcelFilepath)
        pandasDf = pd.read_excel(localExcelFilepath,
                                 catalog["excelSheetName"],
                                 index_col=None,
                                 na_values=['NaN'],
                                 skiprows=catalog["skipRows"])
        pandasDf = PandasUtilities.ConvertDateTimeToObject(pandasDf)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        table = catalog["tables"][0]  # There is only one table in a catalog
        schema = SparkUtilities.BuildSparkSchema(table)
        df = spark.createDataFrame(pandasDf, schema)
        df = SparkUtilities.ConvertNanToNull(df)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            table, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(table)
        self.logger.debug(self.moduleName + " -- " +
                          "ProcessS3File for file: " + s3Key +
                          " finished.\n\n")
Example #5
    def ProcessTables(self, dbCommon, tables):
        '''
        Process each file
        '''
        self.logger.debug(self.moduleName + " -- ProcessTables for  " + tables["table"] + " starting")
        FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)   # Clear the folder from the previous run
        self.ProcessFiles(tables)
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        
        # We will compute "period_type" later
        schemaWithoutPeriodType = SparkUtilities.BuildSparkSchema(tables, excludeComputed=True)
        df = (spark.read
                .format("com.databricks.spark.csv")
                .options(header=False, delimiter=self.job['delimiter'],
                         ignoreTrailingWhiteSpace=True, ignoreLeadingWhiteSpace=True)
                .schema(schemaWithoutPeriodType)
                .load(self.fileUtilities.csvFolder)
            )        

        if "filterData" in tables:
            df = df.filter(tables["filterData"])
        
        # Replace "NEW" with blank.  E.g. DEC1990NEW to DEC1990
        from pyspark.sql import functions as F  #@UnresolvedImport
        df = SparkUtilities.RenameColumnsInList(df, [("period", "period_old")]) # Rename column since we cannot edit in place
        df = df.withColumn("period", F.regexp_replace(df["period_old"], "NEW", ""))

        # Compute "period_type".  Following simple rules have been applied
        #    MAY2013 - 7 characters so assumed to be 'M'
        #    Q12017  - 6 characters so assumed to be 'Q'
        #    2017    - 4 characters so assumed to be 'Y'
        df = df.withColumn("period_type", F.when(F.length(df.period)==7, "M").when(F.length(df.period)==6, "Q").when(F.length(df.period)==4, "Y").otherwise(""))
        
        # Reorder the columns based on the input column order
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = df.select(schema.names)
        
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
        self.logger.debug(self.moduleName + " -- ProcessTables for  " + tables["table"] + " finished")
Example #6
 def MoveToS3(self):
     '''
     move gzip files to s3 and clean local instance
     '''
     try:
         self.logger.debug(self.moduleName + " -- " + "MoveToS3 " +
                           " starting ")
         ###
         #  move any gzip files to the s3 server
         ###
         s3folder = "s3://" + self.job["bucketName"] + self.job[
             "s3GzipFolderBase"]
         S3Utilities.SyncFolderAWSCli(self.fileUtilities.gzipFolder,
                                      s3folder,
                                      args='''--quiet --include "*.gz"''',
                                      dbug="N")
         # Cleanup local files
         FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)
         self.logger.debug(self.moduleName + " -- " + "MoveToS3 " +
                           " finished ")
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in MoveToS3")
         raise
Example #7
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process the current table to load it up
        '''
        try:
            FileUtilities.EmptyFolderContents(
                self.fileUtilities.gzipFolder
            )  # Clear the folder from the previous run
            FileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder
            )  # Clear the folder from the previous run
            url = dbCommon["urlPrefix"] + catalog["name"] + "." + dbCommon[
                "urlExt"]
            self.logger.info(self.moduleName + " - Processing url: " + url)

            localZipFilepath = self.fileUtilities.gzipFolder + "/" + \
                catalog["name"] + "." + dbCommon["urlExt"]

            self.fileUtilities.DownloadFromURL(url, localZipFilepath)

            self.fileUtilities.UnzipFile(localZipFilepath,
                                         self.fileUtilities.csvFolder)
            localFilepath = self.fileUtilities.csvFolder + "/" + catalog[
                "name"] + ".txt"

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            dfMaster = spark.read.json(localFilepath)
            dfMaster = dfMaster.filter(dfMaster.series_id != "")
            for table in catalog["tables"]:

                self.logger.info(self.moduleName + " -- " +
                                 "Processing table: " + table["table"])
                # The column names used in the source may differ from the ones in the final
                # database.  Select columns by their source names, then rename to the destination names.
                schemaSrc = SparkUtilities.BuildSparkSchema(table,
                                                            useValidation=True)
                if table["dataSet"] == "attributes":
                    df = dfMaster.select(schemaSrc.names)
                elif table["dataSet"] == "data":
                    print(
                        dfMaster.rdd.take(5)
                    )  # There is some instability we need to monitor.  Print seems to slow down and stabilize the run???
                    df = dfMaster.rdd.flatMap(
                        lambda row: EIAAthenaSpark.ProcessDataRecords(row)
                    ).toDF(schemaSrc.names)
                else:
                    raise ValueError("Undefined dataSet type")

                schemaDst = SparkUtilities.BuildSparkSchema(table)
                df = SparkUtilities.RenameColumnsToSchema(df, schemaDst)
                df = SparkUtilities.ConvertTypesToSchema(df, schemaDst)
                self.logger.info(self.moduleName + " -- " + "Done reading " +
                                 str(df.count()) +
                                 " rows.  Now saving as parquet file...")

                FileUtilities.EmptyFolderContents(
                    self.fileUtilities.sqlFolder
                )  # Clear the folder from the previous run
                SparkUtilities.SaveParquet(df, self.fileUtilities)
                self.UploadFilesCreateAthenaTablesAndSqlScripts(
                    table, self.fileUtilities.parquet)
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)

            self.logger.debug(self.moduleName + " -- " +
                              "ProcessS3File for: " + url + " finished.\n\n")
        except:
            self.logger.exception("we had an error in EIA on ProcessS3File")
            raise
 def EmptyPackedFolder(self):
     '''
     Empties the packed folder
     '''
     FileUtilities.EmptyFolderContents(self.localTempDirectory + "/packed/")
Example #9
    def Start(self, logger, moduleName, filelocs):
        '''
        main routine for Totem
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.SetUpLocalEnvironment()

            self.currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)
            lastRunRecJson = self.etlUtilities.GetLastGoodRun(
                filelocs["tblEtl"]["table"], self.moduleName)
            paramsList = []
            if lastRunRecJson is not None:
                paramsList = json.loads(lastRunRecJson["params"])
###
#  if we have run this before, get the parameters from that run so we can reuse them
###
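#  the stored params JSON has the shape {"lastrun": [...], "currmonth": [...]},
#  as written by SetInstanceParameters further below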
            prevmonth = []
            currmonth = []

            lenDatesArray = len(paramsList)
            if lenDatesArray > 0:
                if "currmonth" in paramsList:
                    prevmonth = paramsList["currmonth"]
###
#  check and make sure we at least process the current month
###
            prevmonth, currmonth, tdArray = self.CleantdArray(prevmonth)
            for dte in tdArray:
                ##
                #  run thru the dates and find the files associated with each date
                ##
                #                if dte > '2010-05':
                #                    continue
                self.logger.debug(self.moduleName + " -- " +
                                  "date processing " + dte)
                tflArray = self.GetFileList(dte)
                for fls in tflArray:
                    self.GetFile(dte, fls)
                    FileUtilities.EmptyFolderContents(
                        self.fileUtilities.csvFolder)
                    FileUtilities.EmptyFolderContents(self.localTempDirectory +
                                                      '/working')
            self.MoveToS3()
            ###
            #  now load the s3 files into Redshift
            ###
            self.LoadData()
            if self.etlUtilities.SetInstanceParameters(filelocs["tblEtl"]["table"],\
                                                  self.currProcId,\
                                                  json.dumps({"lastrun":prevmonth, "currmonth": currmonth})) is not True:
                self.logger.info(self.moduleName +
                                 " - we could not set the instance.")

            self.UpdateTable(filelocs["tblEtl"]["schemaName"],
                             filelocs["tblEtl"]["table"])
            if self.job["cleanlocal"] == "Y":
                for fld in self.job["folders"]:
                    self.fileUtilities.CreateLocalFolder(fld)

            self.logger.info(self.moduleName + " - Finished processing.")
        except:
            self.logger.exception(moduleName + " - Exception!")
            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"],\
                                             self.currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not Complete Instance.")
            raise
Example #10
class ConsensusAthenaSpark(ApplicationBase):
    '''
    This class is used to control the data load process from different OPEC file sources.
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(ConsensusAthenaSpark, self).__init__()
        self.rawFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(
            self.job["srcSharedFolder"])

        self.logger.info(self.moduleName +
                         " - Downloading files from shared folder...")

        for fileName in sharedFiles:
            if (fileName[:2] == self.job["fileNamePrefix"]
                ) and os.path.splitext(fileName)[1] in self.job["validExts"]:
                shutil.copyfile(
                    os.path.join(self.job["srcSharedFolder"], fileName),
                    self.rawFolder + "/" + fileName)

    def DfCleanUp(self, df, surveyDateVal):
        '''
        Cleans up the worksheet dataframe: drops the configured columns, keeps only the
        date columns, adds the survey date, and melts the result to long format.
        '''
        bankNameColumnIn = "Unnamed: 0"
        surveyDateColName = "surveyDate"

        for colName in self.job["columnsToDrop"]:
            df = df.drop(colName, 1)

        df = df.drop(self.job["dropAfterHeader"], 0)

        for colName in df.head(0):
            dtTest = colName

            if not isinstance(dtTest,
                              datetime) and colName != bankNameColumnIn:
                df = df.drop(colName, 1)

        df = df.assign(surveyDate=surveyDateVal)

        newOrder = [surveyDateColName]

        for colName in df.head(0):
            if colName != surveyDateColName:
                newOrder.append(colName)

        df = df[newOrder]
        df = df.melt(id_vars=[surveyDateColName, bankNameColumnIn])
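        # after the melt, each row holds (surveyDate, bank-name column, original
        # month column name under "variable", and its value under "value")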

        return df

    def GetData(self, rawFileName, mode=None):
        '''
        Returns the full data frame, or only the survey date value when mode == "getSurveyDate"
        '''

        if mode == "getSurveyDate":
            skipRows = 0
        else:
            skipRows = self.job["skipRows"]

        df = pandas.read_excel(rawFileName,
                               sheetname=self.job["worksheetName"],
                               index_col=None,
                               na_values=["na"],
                               skiprows=skipRows,
                               skip_footer=self.job["skipFooter"])

        if mode == "getSurveyDate":
            valReturn = df.iloc[self.job["surveyDateRow"] - 2][0]
        else:
            valReturn = df

        return valReturn

    @staticmethod
    def FormatSurveyDate(emFile):
        '''
        Returns the date based on the file's name
        '''
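        # For a hypothetical file name such as "EMJan2018CF.xlsx" (illustrative,
        # assuming a 2-character prefix), the stem "EMJan2018CF" is reduced to
        # "Jan2018" and the returned value is "2018-1-01".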
        surveyDateColValue = os.path.splitext(emFile)[0]
        surveyDateColValue = surveyDateColValue[2:]
        surveyDateColValue = surveyDateColValue.replace("CF", "")
        surveyDateColValue = str(surveyDateColValue[3:]) + "-" + str(
            list(calendar.month_abbr).index(surveyDateColValue[:3])) + "-01"
        return surveyDateColValue

    def ProcessFiles(self):
        '''
        Controls the workflow for converting, cleaning up, and packing the input files.
        '''
        filesToProcess = self.fileUtilities.ScanFolder(self.rawFolder)

        for emFile in filesToProcess:
            self.logger.info(self.moduleName + " - Processing file: " + emFile)

            rawFileName = self.rawFolder + "/" + emFile
            csvFilename = self.fileUtilities.csvFolder + os.path.splitext(
                emFile)[0] + ".csv"

            try:
                surveyDatedt = self.GetData(rawFileName, "getSurveyDate")

                if isinstance(surveyDatedt, float):
                    surveyDatedt = self.FormatSurveyDate(emFile)
                elif isinstance(surveyDatedt, basestring):
                    if "," in surveyDatedt:
                        tmpDatedt = datetime.strptime(surveyDatedt,
                                                      '%B %d, %Y')
                        surveyDatedt = datetime.strftime(tmpDatedt, "%Y-%m-%d")

                df = self.GetData(rawFileName)
                df = self.DfCleanUp(df, surveyDatedt)

                df.to_csv(csvFilename,
                          header=False,
                          sep=str(self.job["delimiter"]),
                          encoding='utf-8',
                          index=False)

            except XLRDError:
                self.logger.info(self.moduleName + " - No tab named '" +
                                 self.job["worksheetName"] + "' in " + emFile)
            except Exception:
                self.logger.error(self.moduleName +
                                  " - Error while trying to process " + emFile)
                raise
            finally:
                FileUtilities.RemoveFileIfItExists(rawFileName)

    def ProcessTables(self, dbCommon, tables):
        '''
        Process steps:
        pull the files from the shared folder into the raw folder, convert them to CSV,
        then save as Parquet, create the Athena tables, and optionally load into Redshift
        '''
        try:
            self.rawFolder = self.localTempDirectory + "/" + "Raw"
            self.BulkDownload()
            self.ProcessFiles()
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables,
                                            self.job["delimiter"], False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessTables")
            raise

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        ApplicationBase.Start(self, logger, moduleName, filelocs)
        # At some point this will be part of Start
        ApplicationBase.ProcessInput(self, logger, moduleName, filelocs)