def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the file from the share, place it in the raw folder and load it.
    '''
    try:
        processingFile = self.DownloadFile(self.job["srcCategories"])
        self.CreateCSVFile(processingFile, self.job["srcCategories"])

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schemaAllString = SparkUtilities.BuildSparkSchema(tables, True)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header=False, delimiter=self.job["srcCategories"]["delimiter"])
              .option("ignoreTrailingWhiteSpace", "true")
              .option("ignoreLeadingWhiteSpace", "true")
              .schema(schemaAllString)
              .load(self.fileUtilities.csvFolder)
              )
        df = SparkUtilities.ReplaceAll(df, "\xE2\x80\x93", "")
        df2 = SparkUtilities.ConvertTypesToSchema(df, schema)
        SparkUtilities.SaveParquet(df2, self.fileUtilities)

        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
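# A minimal, self-contained sketch of the "read everything as string, then cast to the
# real schema" pattern used above.  The helper below is hypothetical -- it only
# illustrates what a ConvertTypesToSchema-style utility might do; the actual
# SparkUtilities implementation may differ.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def cast_to_schema_sketch(df, targetSchema):
    '''Cast each column of an all-string DataFrame to the type declared in targetSchema.'''
    for field in targetSchema.fields:
        df = df.withColumn(field.name, F.col(field.name).cast(field.dataType))
    return df

if __name__ == "__main__":
    spark = SparkSession.builder.appName("castSketch").getOrCreate()
    allStrings = spark.createDataFrame([("a", "1"), ("b", "2")], ["name", "value"])
    target = StructType([StructField("name", StringType(), True),
                         StructField("value", IntegerType(), True)])
    cast_to_schema_sketch(allStrings, target).printSchema()  # value is now an int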
def CreateParquetFilesAndLoad(self, catalog, partitionValue):
    '''
    Creates the parquet files
    '''
    try:
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        for tables in catalog["tables"]:
            if tables["type"] == "attributes":
                srcFolder = self.fileUtilities.csvFolder + '/attribute/'
            else:
                srcFolder = self.fileUtilities.csvFolder + '/data/'
            tableSchema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read
                  .format("com.databricks.spark.csv")
                  .options(header=False, delimiter=self.job["delimiter"])
                  .schema(tableSchema)
                  .load(srcFolder)
                  )
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(srcFolder)
    except Exception as ex:
        self.logger.exception(self.moduleName + " - we had an error in CreateParquetFilesAndLoad: " + str(ex))
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process a single category configured in the categories dictionary in the jobConfig.
    '''
    try:
        self.logger.debug(self.moduleName + " -- LoadCategory starting")
        processingFile = self.DownloadFile()
        fileOut = processingFile.replace(".dbf", ".txt")
        dbfUtils = DBFUtilities(self.logger)
        dbfUtils.ConvertToCSV(processingFile, fileOut, self.job["delimiter"], False)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='false', delimiter=self.job["delimiter"])
              .schema(schema)
              .load(fileOut)
              )
        self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        self.logger.debug(self.moduleName + " -- LoadCategory finished")
    except Exception:
        self.logger.error(self.moduleName + " - Error while trying to load category...")
        raise
def ProcessTable(self, table):
    '''
    Process the data for the table
    '''
    s3Key = table["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)
    self.CreateFolders(table["table"])

    fileName = ntpath.basename(s3Key)
    localTxtFilepath = self.fileUtilities.csvFolder + "/" + fileName
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                   s3Key, localTxtFilepath)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    schema = SparkUtilities.BuildSparkSchema(table)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header='false', delimiter=self.job["delimiter"])
          .schema(schema)
          .load(localTxtFilepath)
          )
    self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                     " rows. Now saving as parquet file...")
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet,
                                                    table["partitionValue"])
    self.logger.info(self.moduleName + " -- ProcessTable finished")
def ProcessTables(self, dbCommon, tables):
    '''
    Load the ENP Yearly Table
    '''
    try:
        outputfileName = self.fileUtilities.csvFolder + '/ENPdata.csv'
        conn = self.EstablishConnection(dbCommon)
        cur = conn.cursor()
        sqlline = self.FixSQLStatement(dbCommon)
        cur.execute(sqlline)
        self.ConvertToCSV(cur, outputfileName)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='false', delimiter=self.job["delimiter"])
              .schema(schema)
              .load(outputfileName)
              )
        self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load table. Error: " + str(err))
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process each Vantage table.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "Processing data for table: " + tables["table"])
        fileName = self.BulkExtract(tables["table"], tables["scriptFile"], dbCommon)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='false', delimiter=self.job["delimiter"])
              .schema(schema)
              .load(fileName)
              )
        self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load table. Error: " + str(err))
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process each file
    '''
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " starting")
    FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)  # Clear the folder from the previous run
    self.ProcessFiles(tables)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    # We will compute "period_type" later
    schemaWithoutPeriodType = SparkUtilities.BuildSparkSchema(tables, excludeComputed=True)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header=False, delimiter=self.job['delimiter'],
                   ignoreTrailingWhiteSpace=True, ignoreLeadingWhiteSpace=True)
          .schema(schemaWithoutPeriodType)
          .load(self.fileUtilities.csvFolder)
          )
    if "filterData" in tables:
        df = df.filter(tables["filterData"])

    # Replace "NEW" with blank.  E.g. DEC1990NEW becomes DEC1990
    from pyspark.sql import functions as F  # @UnresolvedImport
    # Rename the column since we cannot edit it in place
    df = SparkUtilities.RenameColumnsInList(df, [("period", "period_old")])
    df = df.withColumn("period", F.regexp_replace(df["period_old"], "NEW", ""))

    # Compute "period_type".  The following simple rules are applied:
    #   MAY2013 - 7 characters, assumed to be 'M'
    #   Q12017  - 6 characters, assumed to be 'Q'
    #   2017    - 4 characters, assumed to be 'Y'
    df = df.withColumn("period_type",
                       F.when(F.length(df.period) == 7, "M")
                        .when(F.length(df.period) == 6, "Q")
                        .when(F.length(df.period) == 4, "Y")
                        .otherwise(""))

    # Reorder the columns based on the input column order
    schema = SparkUtilities.BuildSparkSchema(tables)
    df = df.select(schema.names)

    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " finished")
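# A small standalone check of the period handling above: strip a trailing "NEW"
# and derive period_type from the string length (7 -> 'M', 6 -> 'Q', 4 -> 'Y').
# Runs against an in-memory DataFrame; the sample values are illustrative only.
from pyspark.sql import SparkSession, functions as F

if __name__ == "__main__":
    spark = SparkSession.builder.appName("periodTypeSketch").getOrCreate()
    df = spark.createDataFrame([("DEC1990NEW",), ("Q12017",), ("2017",)], ["period_old"])
    df = df.withColumn("period", F.regexp_replace("period_old", "NEW", ""))
    df = df.withColumn("period_type",
                       F.when(F.length("period") == 7, "M")
                        .when(F.length("period") == 6, "Q")
                        .when(F.length("period") == 4, "Y")
                        .otherwise(""))
    df.show()  # DEC1990 -> M, Q12017 -> Q, 2017 -> Y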
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process each file
    '''
    # Load the data from the S3 data lake into Redshift using Athena/Redshift Spectrum
    s3Key = dbCommon["s3SrcDirectory"] + "/" + catalog["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)  # Clear the folder from the previous run
    FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)   # Clear the folder from the previous run
    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.S3Copy(s3Key, localGzipFilepath)

    localExcelFilepath = self.fileUtilities.csvFolder + "/" + fileName
    # Remove the gz extension
    localExcelFilepath = re.sub(r'\.gz$', '', localExcelFilepath)
    self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

    # We don't have a raw Excel reader for Spark, so use Pandas
    self.logger.info(self.moduleName + " - Processing Excel file: " + localExcelFilepath)
    pandasDf = pd.read_excel(localExcelFilepath, catalog["excelSheetName"],
                             index_col=None, na_values=['NaN'],
                             skiprows=catalog["skipRows"])
    pandasDf = PandasUtilities.ConvertDateTimeToObject(pandasDf)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    table = catalog["tables"][0]  # There is only one table in a catalog
    schema = SparkUtilities.BuildSparkSchema(table)
    df = spark.createDataFrame(pandasDf, schema)
    df = SparkUtilities.ConvertNanToNull(df)
    SparkUtilities.SaveParquet(df, self.fileUtilities)

    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(table)
    self.logger.debug(self.moduleName + " -- " + "ProcessS3File for file: " + s3Key + " finished.\n\n")
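# Hedged sketch of the pandas -> Spark hand-off used above: since there is no raw Excel
# reader for Spark here, the sheet is read with pandas and converted with an explicit
# schema.  In the job the frame comes from pd.read_excel; an in-memory frame with made-up
# column names keeps this sketch runnable.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

if __name__ == "__main__":
    spark = SparkSession.builder.appName("pandasToSparkSketch").getOrCreate()
    pandasDf = pd.DataFrame({"series": ["a", "b"], "value": [1.0, 2.0]})
    schema = StructType([StructField("series", StringType(), True),
                         StructField("value", DoubleType(), True)])
    df = spark.createDataFrame(pandasDf, schema)  # column order must match the schema
    df.printSchema()
    df.show()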
def ProcessTable(self, table):
    '''
    Process data for the table
    :param table: the table configuration to process
    :return:
    '''
    s3Key = self.job["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    self.fileUtilities.moduleName = self.moduleName
    self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
    self.fileUtilities.CreateFolders(self.job["folders"])

    fileName = ntpath.basename(s3Key)
    local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                   s3Key, local7zipFilePath)

    localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
    localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)
    self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)
    fileToBeLoaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    schema = SparkUtilities.BuildSparkSchema(table)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header='true', delimiter=self.job["delimiter"],
                   ignoreTrailingWhiteSpace='true')
          .schema(schema)
          .load(fileToBeLoaded)
          )
    #df.show()
    self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                     " rows. Now saving as parquet file...")
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.logger.info(self.moduleName + " -- UploadFilesCreateAthenaTablesAndSqlScripts finished")
samplejson = '''{
    "fields": [
        {"metadata": {}, "nullable": true, "name": "sourceset", "type": "string"},
        {"metadata": {}, "nullable": true, "name": "sourcesetdesc", "type": "string"},
        {"metadata": {}, "nullable": true, "name": "market_and_exchange_names", "type": "string"},
        {"metadata": {}, "nullable": true, "name": "as_of_date_in_form_yymmdd", "type": "integer"},
        {"metadata": {}, "nullable": true, "name": "report_date_as_yyyy_mm_dd", "type": "string"}
    ],
    "type": "struct"
}'''

#schemaJson = json.loads(samplejson)
#from pyspark.sql.types import StructType  # @UnresolvedImport
#schema = StructType.fromJson(schemaJson)
schema = SparkUtilities.BuildSparkSchema(table)
#print(schema)

localTxtFilepath = "C:/tmp/testfiles/COTHist2011.csv"
df = (sqlContext.read
      .format("csv")
      .option("header", "true")
      .option("delimiter", ",")
      .option("ignoreTrailingWhiteSpace", "true")
      .option("ignoreLeadingWhiteSpace", "true")
      .schema(schema)
      .load(localTxtFilepath))
df.printSchema()
df.show()

sc.stop()
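# For reference: the samplejson string above can be converted into a Spark schema
# directly, which is what the commented-out StructType.fromJson lines illustrate.
# This check reuses the samplejson variable defined above.
import json
from pyspark.sql.types import StructType

schemaFromJson = StructType.fromJson(json.loads(samplejson))
print(schemaFromJson.fieldNames())  # ['sourceset', 'sourcesetdesc', ...]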
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process the current table to load it up
    '''
    try:
        FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)  # Clear the folder from the previous run
        FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)   # Clear the folder from the previous run

        url = dbCommon["urlPrefix"] + catalog["name"] + "." + dbCommon["urlExt"]
        self.logger.info(self.moduleName + " - Processing url: " + url)
        localZipFilepath = self.fileUtilities.gzipFolder + "/" + catalog["name"] + "." + dbCommon["urlExt"]
        self.fileUtilities.DownloadFromURL(url, localZipFilepath)
        self.fileUtilities.UnzipFile(localZipFilepath, self.fileUtilities.csvFolder)
        localFilepath = self.fileUtilities.csvFolder + "/" + catalog["name"] + ".txt"

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        dfMaster = spark.read.json(localFilepath)
        dfMaster = dfMaster.filter(dfMaster.series_id != "")
        for table in catalog["tables"]:
            self.logger.info(self.moduleName + " -- " + "Processing table: " + table["table"])
            # The column names used in the source may be different from the ones in the final
            # database.  Select columns based on the source and then rename to the destination.
            schemaSrc = SparkUtilities.BuildSparkSchema(table, useValidation=True)
            if table["dataSet"] == "attributes":
                df = dfMaster.select(schemaSrc.names)
            elif table["dataSet"] == "data":
                # There is some instability we need to monitor.  Print seems to slow down and stabilize the run???
                print(dfMaster.rdd.take(5))
                df = dfMaster.rdd.flatMap(lambda row: EIAAthenaSpark.ProcessDataRecords(row)).toDF(schemaSrc.names)
            else:
                raise ValueError("Undefined dataSet type")

            schemaDst = SparkUtilities.BuildSparkSchema(table)
            df = SparkUtilities.RenameColumnsToSchema(df, schemaDst)
            df = SparkUtilities.ConvertTypesToSchema(df, schemaDst)
            self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                             " rows. Now saving as parquet file...")

            FileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)  # Clear the folder from the previous run
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)
        self.logger.debug(self.moduleName + " -- " + "ProcessS3File for: " + url + " finished.\n\n")
    except:
        self.logger.exception("we had an error in EIA on ProcessS3File")
        raise
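# A minimal flatMap -> toDF sketch of the "data" branch above.  The record layout
# (a series_id plus a list of (period, value) pairs) is assumed here for illustration
# only; the real flattening logic lives in EIAAthenaSpark.ProcessDataRecords.
from pyspark.sql import Row, SparkSession

def explode_series_sketch(row):
    # Emit one (series_id, period, value) tuple per data point in the series row.
    return [(row.series_id, period, value) for period, value in row.data]

if __name__ == "__main__":
    spark = SparkSession.builder.appName("flatMapSketch").getOrCreate()
    master = spark.createDataFrame([Row(series_id="S1", data=[("2016", 1.0), ("2017", 2.0)])])
    df = master.rdd.flatMap(explode_series_sketch).toDF(["series_id", "period", "value"])
    df.show()  # two rows: (S1, 2016, 1.0) and (S1, 2017, 2.0)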
def ProcessTables(self, dbCommon, tables):
    '''
    Process the current table to load it up
    '''
    try:
        self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " starting")
        # Cleanup first (TODO - Need a more generic way to do this)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)

        # Variables used for handling chunks.  -1 for full load
        chunkStart = chunkEnd = maxValue = chunkSize = -1
        if "incrementalconditions" in tables:
            incrementalConditions = tables["incrementalconditions"]
            if "startID" in incrementalConditions:
                chunkEnd = incrementalConditions["startID"] - 1
            else:
                athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
                try:
                    # This is where we last ended.  Start at 1 + this end
                    chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName,
                                                               tables["table"], tables["distkey"],
                                                               self.logger))
                except ValueError:
                    chunkEnd = 0  # Table does not exist yet
                except:
                    raise

            if "endID" in incrementalConditions:
                maxValue = incrementalConditions["endID"]
            else:
                # TODO - Fix this.  Also, we should start at the source min value not 0.
                maxValue = 2000000000  # BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, chunkStart)

            chunkSize = tables["incrementalconditions"]["chunksize"]
            chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

        fieldDelimiter = self.job["delimiter"]
        if "delimiter" in tables:
            fieldDelimiter = tables["delimiter"]

        while chunkStart <= maxValue:
            partitionValue = self.GetPartitionValue(tables, chunkStart)
            sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                              self.logger, self.fileUtilities, self.location)
            # Construct a file name that is meaningful.  That is, it has the start and end IDs
            outputCSV = self.fileUtilities.csvFolder + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd) + ".csv"
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
            self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldDelimiter,
                                          self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)

            # Process the data using Spark and save as Parquet
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            schema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read
                  .format("com.databricks.spark.csv")
                  .options(header='false', delimiter=fieldDelimiter)
                  .schema(schema)
                  .load(self.fileUtilities.csvFolder)
                  )
            df.printSchema()
            df.show()
            df = SparkUtilities.ProcessSpecialCharsIfAny(df, tables)

            self.logger.info(self.moduleName + " -- " + "DONE READING " + str(df.count()) +
                             " ROWS. Now saving as parquet file...")
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.parquet)
            SparkUtilities.SaveParquet(df, self.fileUtilities)

            # Need to load the data and clear the local space
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
            tables["new"] = "N"  # Do not recreate

            if chunkSize < 0:
                break  # Done with the single load
            chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

        # TODO - Need to make sure we don't end up with duplicate data if we run the code
        # twice on the same day
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)

        self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " finished")
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessDatabase for " + tables["table"])
        raise
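# Hypothetical sketch of the chunk-window arithmetic driving the loop above.  The real
# UpdateChunkStartEnd may differ; this only illustrates the intended contract: advance
# the window by chunkSize without running past maxValue (chunkSize < 0 means one full load).
def update_chunk_start_end_sketch(chunkEnd, chunkSize, maxValue):
    chunkStart = chunkEnd + 1
    if chunkSize < 0:
        return chunkStart, maxValue  # single full-range pass
    return chunkStart, min(chunkEnd + chunkSize, maxValue)

# Example: starting from chunkEnd=0 with chunkSize=1000 and maxValue=2500, the windows are
# (1, 1000), (1001, 2000), (2001, 2500); once chunkStart exceeds maxValue the loop ends.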
def ProcessTables(self, dbCommon, tables):
    '''
    Get the last partition value, use that as the date to pull data, then put that data into Athena.
    '''
    try:
        outputCSV = self.fileUtilities.csvFolder + self.moduleName + ".CSV"
        fieldTerminator = self.job["fieldTerminator"]
        if "fieldTerminator" in tables:
            fieldTerminator = tables["fieldTerminator"]
        rawFolder = self.localTempDirectory + "/raw/"
        rowTerminator = None  # Not using this; stick with the default of CR/LF rather than self.job["rowTerminator"]

        if "pullTemplate" in tables:
            lastRunDate = self.GetParameters(tables)
            formattedLastRunDate = lastRunDate[4:6] + '/' + lastRunDate[6:8] + '/' + lastRunDate[:4]
            sqlPullDataScript = self.CreatePullScript(tables, formattedLastRunDate)
            self.bcpUtilities.BulkExtract(self.fileUtilities.LoadSQLQuery(sqlPullDataScript),
                                          outputCSV, dbCommon, tables, fieldTerminator,
                                          rowTerminator, self.job["bcpUtilityDirOnLinux"],
                                          self.fileUtilities, self.logger)
            self.masterSchema = SparkUtilities.BuildSparkSchema(tables)
            self.fileUtilities.MoveFilesFromOneFolderToAnother(self.fileUtilities.csvFolder,
                                                               rawFolder, '*.csv')
            return

        ###
        #  Load data frame from CSV file
        ###
        partitionValue = self.GetPartitionValue()
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header=False, delimiter=fieldTerminator)
              .schema(self.masterSchema)
              .load(rawFolder)
              )

        cols = []
        for field in tables["fields"]:
            if "athenaOnly" in field:
                if field["athenaOnly"] != "Y":
                    cols.append(field["name"])
            else:
                cols.append(field["name"])

        if tables["type"] == "attributes":
            dfAttributes = df.select(cols).distinct()
            if dfAttributes.count() == 0:
                self.logger.debug(self.moduleName + " - no records to process for Attribute data")
                return
            SparkUtilities.SaveParquet(dfAttributes, self.fileUtilities)
        elif tables["type"] == "series":
            dfSeries = df.select(cols)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    dfSeries = SparkUtilities.FormatColumn(dfSeries, fld["name"], fld["inputFormat"])
            if dfSeries.count() == 0:
                self.logger.debug(self.moduleName + " - no records to process for Series data")
                return
            SparkUtilities.SaveParquet(dfSeries, self.fileUtilities)

        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise