def StageCodeBookFile(self):
    try:
        self.logger.info("Starting staging codebook file")
        codebookHeaderRow = self.config.get(
            'CodeBook', str(self.year) + self.responseClass + "header")
        codebookSheetName = self.config.get(
            'CodeBook', str(self.year) + self.responseClass + "sheet")
        # read_excel with a single sheet name already returns a DataFrame
        codebookDF = pd.read_excel(self.codeBookFile,
                                   index_col=None,
                                   header=int(codebookHeaderRow),
                                   sheet_name=codebookSheetName)

        self.logger.info("Starting transformation of codebook.")
        codebookDF = self.AddCodesForMissingData(codebookDF)
        codebookDF = codebookDF.drop_duplicates()
        self.cbdf = codebookDF

        self.logger.info(
            "Starting insertion of codebook file data into staging table.")
        with SurveyDatabase.surveyDatabase() as db:
            db.createStagingTableFromDF(
                self.cbdf, 'codebook_' + self.responseClass + str(self.year))
        self.logger.info(
            "Finished insertion of codebook file data into staging table.")
    except Exception as e:
        self.logger.error(e.args[0])
        raise
def initLogging():
    config = SurveyConfigReader.surveyConfig()
    LogToDB = config.get('Logging', 'LogToDB')

    # DEBUG    - Detailed information, typically of interest only when diagnosing problems.
    # INFO     - Confirmation that things are working as expected.
    # WARNING  - An indication that something unexpected happened, or of some problem in the
    #            near future (e.g. ‘disk space low’). The software is still working as expected.
    # ERROR    - Due to a more serious problem, the software has not been able to perform some function.
    # CRITICAL - A serious error, indicating that the program itself may be unable to continue running.
    loggingLevel = logging.INFO

    for logger_name in ['surveyLogger']:
        logger = logging.getLogger(logger_name)
        logger.setLevel(loggingLevel)

        if LogToDB == 'True':
            sqlConn = SurveyDatabase.surveyDatabase().sql_conn
            cursor = sqlConn.cursor()
            ch = LogDBHandler(sqlConn, cursor, "stg.log")
        else:
            path = config.get('Logging', 'LoggingDirectory')
            ch = logging.FileHandler(
                path + 'Logging_' +
                datetime.datetime.now().strftime("%Y%m%d") + '.log')

        ch.setFormatter(
            logging.Formatter(
                '%(asctime)s - %(threadName)s %(filename)s:%(lineno)d - '
                '%(funcName)s() - %(levelname)s - %(message)s'))
        ch.setLevel(loggingLevel)
        logger.addHandler(ch)
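# Usage sketch (illustrative, not part of the pipeline): once initLogging() has run,
# any module can pick up the configured handler simply by asking for the named logger.
# The function and messages below are hypothetical.
import logging

logger = logging.getLogger('surveyLogger')

def example_staging_step():
    logger.info("Beginning staging run")
    try:
        raise ValueError("example failure")
    except Exception as e:
        logger.error(e.args[0])
        raise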
def TransformResponseAndCodeTable(self, rfdf, cbdf):
    # For each column in the response file, add the parsed codebook columns
    for header in list(rfdf.columns.values):
        rfdf = self.AddParseColumns(header, rfdf, cbdf)
    with SurveyDatabase.surveyDatabase() as db:
        db.createStagingTableFromDF(
            rfdf, 'response_and_code_' + self.responseClass + '_' + str(self.year))
def StageResponseFile(self):
    try:
        self.logger.info("Starting staging response file")
        with SurveyDatabase.surveyDatabase() as db:
            # Read in where the header row starts
            header_row = int(
                self.config.get(
                    'Response', str(self.year) + self.responseClass + "header"))
            self.logger.info("Setting header row to: " + str(header_row))

            sheet_name = self.config.get(
                'Response', str(self.year) + self.responseClass + "sheet")
            self.logger.info("Setting sheet name to: " + sheet_name)

            self.logger.info("Reading in file format for " + str(self.year))
            readFormat = self.config.get(
                'Response', str(self.year) + self.responseClass + "format")

            if readFormat == 'excel':
                self.logger.info('Reading in survey excel file from: ' +
                                 self.responseFile)
                rfdf = pd.read_excel(self.responseFile,
                                     index_col=None,
                                     header=header_row,
                                     sheet_name=sheet_name)
            elif readFormat == 'database':
                self.logger.info('Reading in survey from database table')
                rfdf = db.executeAndPandas("SELECT * FROM [stg].[" +
                                           self.responseFile + "]")
            elif readFormat == 'csv':
                self.logger.info('Reading in survey csv file from: ' +
                                 self.responseFile)
                rfdf = pd.read_csv(self.responseFile,
                                   index_col=None,
                                   header=header_row)
            else:
                raise Exception('Unknown Format Type')

            self.logger.info(
                "Starting insertion of response file data into staging table.")
            db.createStagingTableFromDF(rfdf, 'Survey_file_' + str(self.year))
            self.logger.info(
                "Finished insertion of response file data into staging table.")
            self.rfdf = rfdf
    except Exception as e:
        self.logger.error(e.args[0])
        raise
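# The config lookups in StageResponseFile compose their key names as
# <year><responseClass><setting> under the 'Response' section. A minimal sketch of
# what that section might contain for a 2017 trip file, using configparser; the
# concrete values here are illustrative assumptions, not the real config.
import configparser

config = configparser.ConfigParser()
config['Response'] = {
    '2017tripheader': '0',      # zero-based header row passed to pandas
    '2017tripsheet': 'Sheet1',  # worksheet name used when the format is 'excel'
    '2017tripformat': 'csv',    # one of: excel, database, csv
}

year, responseClass = '2017', 'trip'
header_row = int(config.get('Response', str(year) + responseClass + 'header'))
readFormat = config.get('Response', str(year) + responseClass + 'format')
print(header_row, readFormat)  # -> 0 csv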
def ProcessHouseholdFactTable(self):
    try:
        with SurveyDatabase.surveyDatabase() as db:
            db.execute("exec HHSurvey.mergeHouseholdFact" + str(self.year))
        return True
    except Exception as e:
        self.logger.error(e.args[0])
        raise
def ProcessTripDim(self):
    try:
        with SurveyDatabase.surveyDatabase() as db:
            # upsert logic instead of sql logic
            db.execute("exec HHSurvey.merge_trip_dim_" + str(self.year))
    except Exception as e:
        self.logger.error(e.args[0])
        raise
def ProcessFactTable(self):
    try:
        with SurveyDatabase.surveyDatabase() as db:
            db.execute("exec HHSurvey.merge_" + self.responseClass +
                       "_fact_" + str(self.year))
        return True
    except Exception as e:
        self.logger.error(e.args[0])
        raise
def LoadDims(self):
    try:
        self.logger.info("Starting loading {}_dim".format(self.responseClass))
        sproc_name = 'HHSurvey.merge_{}_dim_{}'.format(self.responseClass,
                                                       str(self.year))
        with SurveyDatabase.surveyDatabase() as db:
            db.execute("exec {}".format(sproc_name))
        self.logger.info("Finished loading {}_dim".format(self.responseClass))
    except Exception as e:
        self.logger.error(e.args[0])
        raise
def ProcessMapping(self, df):
    with SurveyDatabase.surveyDatabase() as db:
        self.logger.info("Pulling in the mapping table")
        mappingDF = db.pullMappingTable("File_" + str(self.year))

        # Convert the mapping table to a dictionary of original -> master column names
        mappingDict = mappingDF.set_index("Orginal_Names")["Master_Names"].to_dict()

        self.logger.info("Renaming columns to master names")
        df = df.rename(index=str, columns=mappingDict)
        # Return the DataFrame with its columns renamed to the master names
        return df
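# The rename itself is plain pandas: a dictionary keyed by the original column names
# maps them to the master names. A small self-contained illustration with made-up
# column names; the real names come from the mapping table.
import pandas as pd

mappingDF = pd.DataFrame({
    'Orginal_Names': ['hhid', 'personid'],
    'Master_Names': ['household_id', 'person_id'],
})
mappingDict = mappingDF.set_index('Orginal_Names')['Master_Names'].to_dict()

df = pd.DataFrame({'hhid': [1, 2], 'personid': [10, 20]})
df = df.rename(index=str, columns=mappingDict)
print(list(df.columns))  # -> ['household_id', 'person_id']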
def TransformResponseAndCodeTable(self, rfdf, cbdf):
    header = None
    try:
        # For each column in the response file, add the parsed codebook columns
        for header in list(rfdf.columns.values):
            rfdf = self.AddParseColumns(header, rfdf, cbdf)
        with SurveyDatabase.surveyDatabase() as db:
            db.createStagingTableFromDF(
                rfdf,
                'response_and_code_' + self.responseClass + '_' + str(self.year))
    except Exception as e:
        self.logger.error('header=' + str(header) + ' ' + e.args[0])
        raise
def ProcessHouseHoldDim(self):
    try:
        with SurveyDatabase.surveyDatabase() as db:
            # upsert logic instead of sql logic
            db.execute("exec HHSurvey.merge_household_dim_" + str(self.year))
            # TODO: delete existing year data
            # TODO: format df to HouseholdDim structure
            # TODO: append data to dim table
            #       appendTableFromDF('dbo', df, 'HouseholdDim')
        return True
    except Exception as e:
        self.logger.error(e.args[0])
        raise
def StageCodeBookFile(self):
    try:
        self.logger.info("Starting staging codebook file")
        with SurveyDatabase.surveyDatabase() as db:
            codebookHeaderRow = self.config.get(
                'CodeBook', str(self.year) + self.responseClass + "header")
            codebookSheetName = self.config.get(
                'CodeBook', str(self.year) + self.responseClass + "sheet")
            # read_excel with a single sheet name already returns a DataFrame
            codebookDF = pd.read_excel(self.codeBookFile,
                                       index_col=None,
                                       header=int(codebookHeaderRow),
                                       sheet_name=codebookSheetName)

            self.logger.info(
                "Starting insertion of codebook file data into staging table.")
            db.createStagingTableFromDF(
                codebookDF, 'codebook_' + self.responseClass + str(self.year))
            self.logger.info(
                "Finished insertion of codebook file data into staging table.")

            self.logger.info("Starting transformation of codebook.")
            # Blank out the section labels so the real field names can be forward-filled.
            # np.nan assumes numpy is imported as np; pd.np has been removed from pandas.
            codebookDF = codebookDF.replace('Valid Values', np.nan)
            codebookDF = codebookDF.replace('Labeled Values', np.nan)
            codebookDF[['order', 'Field']] = codebookDF[['order', 'Field']].fillna(method='ffill')

            if self.year == '2017' and self.responseClass == 'trip':
                codebookDF = self.AddSpecialCodeBookRows(codebookDF)

            self.cbdf = codebookDF[['Field', 'Variable', 'Value']]
            self.logger.info("Finished transformation of codebook.")
    except Exception as e:
        self.logger.error(e.args[0])
        raise
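# The forward fill is what turns the codebook's block layout (field name written only
# once per block of values) into one fully populated row per value. A toy example of
# that step; the rows and column contents below are made up.
import pandas as pd
import numpy as np

codebookDF = pd.DataFrame({
    'order': [1, np.nan, np.nan, 2, np.nan],
    'Field': ['mode', np.nan, np.nan, 'purpose', np.nan],
    'Variable': ['Valid Values', 1, 2, 'Valid Values', 1],
    'Value': [np.nan, 'Walk', 'Bike', np.nan, 'Work'],
})

# Same pattern as StageCodeBookFile: blank the section label, then carry 'order' and
# 'Field' down so every value row knows which field it belongs to.
codebookDF = codebookDF.replace('Valid Values', np.nan)
codebookDF[['order', 'Field']] = codebookDF[['order', 'Field']].fillna(method='ffill')
print(codebookDF[['Field', 'Variable', 'Value']])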
def ProcessPersonDim(self):
    with SurveyDatabase.surveyDatabase() as db:
        db.execute("exec HHSurvey.merge_person_dim_" + str(self.year))