Example #1
    def ProcessTables(self, dbCommon, tables):
        '''
        Process steps:
        pull the file from the share and place it in the raw folder
        '''
        try:
            processingFile = self.DownloadFile(self.job["srcCategories"])
            self.CreateCSVFile(processingFile, self.job["srcCategories"])

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            schemaAllString = SparkUtilities.BuildSparkSchema(tables, True)
            schema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read
                    .format("com.databricks.spark.csv")
                    .options(header=False, delimiter=self.job["srcCategories"]["delimiter"])
                    .option("ignoreTrailingWhiteSpace", "true")
                    .option("ignoreLeadingWhiteSpace", "true")            
                    .schema(schemaAllString)
                    .load(self.fileUtilities.csvFolder)
                )
            # Remove the UTF-8 en dash byte sequence from the data
            df = SparkUtilities.ReplaceAll(df, "\xE2\x80\x93", "")
            df2 = SparkUtilities.ConvertTypesToSchema(df, schema)            
            SparkUtilities.SaveParquet(df2, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                    self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except Exception:
            self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
            raise
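
Both examples read the raw data with an all-string schema and only afterwards cast it to the typed schema via SparkUtilities.ConvertTypesToSchema. That helper is not shown above; the following is a minimal sketch of what such a cast step might look like, assuming it simply casts each column to the type declared for it in the destination schema (names and behavior are assumptions, not the actual implementation):

from pyspark.sql import functions as F

def ConvertTypesToSchema(df, schema):
    # Hypothetical stand-in for SparkUtilities.ConvertTypesToSchema:
    # cast every column of the all-string DataFrame to the type
    # declared for it in the destination schema.
    for field in schema.fields:
        df = df.withColumn(field.name, F.col(field.name).cast(field.dataType))
    return df
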
Example #2
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process the current catalog and load its tables
        '''
        try:
            FileUtilities.EmptyFolderContents(
                self.fileUtilities.gzipFolder
            )  # Clear the folder from the previous run
            FileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder
            )  # Clear the folder from the previous run
            url = dbCommon["urlPrefix"] + catalog["name"] + "." + dbCommon[
                "urlExt"]
            self.logger.info(self.moduleName + " - Processing url: " + url)

            localZipFilepath = self.fileUtilities.gzipFolder + "/" + \
                catalog["name"] + "." + dbCommon["urlExt"]

            self.fileUtilities.DownloadFromURL(url, localZipFilepath)

            self.fileUtilities.UnzipFile(localZipFilepath,
                                         self.fileUtilities.csvFolder)
            localFilepath = self.fileUtilities.csvFolder + "/" + catalog[
                "name"] + ".txt"

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            dfMaster = spark.read.json(localFilepath)
            dfMaster = dfMaster.filter(dfMaster.series_id != "")
            for table in catalog["tables"]:

                self.logger.info(self.moduleName + " -- " +
                                 "Processing table: " + table["table"])
                # The column names used in the source may be different from the ones in the final
                # database.  Select columns based on the source and then rename to the destination.
                schemaSrc = SparkUtilities.BuildSparkSchema(table,
                                                            useValidation=True)
                if table["dataSet"] == "attributes":
                    df = dfMaster.select(schemaSrc.names)
                elif table["dataSet"] == "data":
                    # There is some instability we need to monitor; printing a few
                    # rows seems to slow down and stabilize the run.
                    print(dfMaster.rdd.take(5))
                    df = (dfMaster.rdd
                          .flatMap(lambda row: EIAAthenaSpark.ProcessDataRecords(row))
                          .toDF(schemaSrc.names))
                else:
                    raise ValueError("Undefined dataSet type")

                schemaDst = SparkUtilities.BuildSparkSchema(table)
                df = SparkUtilities.RenameColumnsToSchema(df, schemaDst)
                df = SparkUtilities.ConvertTypesToSchema(df, schemaDst)
                self.logger.info(self.moduleName + " -- " + "Done reading " +
                                 str(df.count()) +
                                 " rows.  Now saving as parquet file...")

                FileUtilities.EmptyFolderContents(
                    self.fileUtilities.sqlFolder
                )  # Clear the folder from the previous run
                SparkUtilities.SaveParquet(df, self.fileUtilities)
                self.UploadFilesCreateAthenaTablesAndSqlScripts(
                    table, self.fileUtilities.parquet)
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)

            self.logger.debug(self.moduleName + " -- " +
                              "ProcessCatalogs for: " + url + " finished.\n\n")
        except Exception:
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessCatalogs")
            raise
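
Example #2 also relies on SparkUtilities.RenameColumnsToSchema to map the source column names onto the destination schema before the cast. A minimal sketch of such a helper, assuming the DataFrame columns are positionally aligned with the destination schema (the real implementation is not shown in these examples):

def RenameColumnsToSchema(df, schema):
    # Hypothetical stand-in for SparkUtilities.RenameColumnsToSchema:
    # rename the columns positionally to the names in the target schema.
    return df.toDF(*[field.name for field in schema.fields])

In Example #2 the renamed DataFrame is then passed through the same ConvertTypesToSchema-style cast sketched after Example #1 before being written out as parquet.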