Ejemplo n.º 1
0
    def ProcessTables(self, dbCommon, tables):
        '''
        Pulls data from the most current spreadsheet into a CSV file, then
        converts that CSV to parquet and publishes it.

        Steps, in order:
          1. Locate the most current file in self.job["foldertoscan"].
          2. Create the CSV file for `tables` and, if a source file was
             found, have ProcessFile write its rows through a csv writer.
          3. Read the CSV into a Spark dataframe, apply any "adjustFormat"
             column formatting, and save it as parquet.
          4. Upload the parquet output / create the Athena tables and SQL
             scripts, then empty the CSV working folder.
          5. If tables["loadToRedshift"] == "Y", load the data into Redshift.

        Args:
            dbCommon: not referenced in this method body; presumably kept so
                the signature matches the caller's convention (TODO confirm).
            tables: table configuration. Optional keys read here:
                "adjustFormat" (iterable of entries with "name" and
                "inputFormat") and "loadToRedshift".

        Raises:
            Re-raises any exception after logging it.
        '''
        try:
            xl = ExcelUtilities(self.logger)
            localFilepath = self.fileUtilities.FindMostCurrentFile(
                self.job["foldertoscan"])
            csvfile = self.CreateCsvFile(tables)
            try:
                csvWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)

                if localFilepath is not None:
                    self.ProcessFile(xl, localFilepath, csvWriter)
            finally:
                # Always release the file handle, even when ProcessFile
                # fails; previously an error here left the CSV file open.
                csvfile.close()

            # The CSV must be closed (flushed) before Spark reads it back.
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables,
                                            self.job["delimiter"], False,
                                            self.fileUtilities.csvFolder,
                                            self.logger)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"],
                                                     fld["inputFormat"])
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(
                tables, self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(
                self.fileUtilities.csvFolder)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            # Log with traceback, then propagate unchanged to the caller.
            self.logger.exception(self.moduleName +
                                  " - we had an error in ProcessTables")
            raise