def process(self):
    Logger.init()
    # Input parameters
    parsMap = self.getFormatParameter()
    # Execution control
    self.dwSqlServcie = DwSql()
    # Log type
    logType = parsMap.get('logType')
    if (logType == None or logType == ''):
        # APP topic logs
        self.dwAppAccessLogStatus = self.dwAppAccessLog(parsMap)
        self.dwAppActionDetailLogStatus = self.dwAppActionDetailLog(parsMap)
        # WEB topic logs
        self.dwWebVisitTrafficLogStatus = self.dwWebVisitTrafficLog(parsMap)
        self.dwWebActionDetailLogStatus = self.dwWebActionDetailLog(parsMap)
    elif (logType == 'dw_app_access_log'):
        self.dwAppAccessLogStatus = self.dwAppAccessLog(parsMap)
    elif (logType == 'dw_app_action_detail_log'):
        self.dwAppActionDetailLogStatus = self.dwAppActionDetailLog(parsMap)
    elif (logType == 'dw_web_visit_traffic_log'):
        self.dwWebVisitTrafficLogStatus = self.dwWebVisitTrafficLog(parsMap)
    elif (logType == 'dw_web_action_detail_log'):
        self.dwWebActionDetailLogStatus = self.dwWebActionDetailLog(parsMap)
def runTable(self, q, threadPoolNum):
    wNum = 1
    while (True):
        # Exit when the queue is empty
        if (q.empty() == True):
            break
        # Next item to consume from the queue
        qTableInfo = q.get()
        sourceTb = str(qTableInfo['id']) + ': ' + qTableInfo['db_name'] + '.' + qTableInfo['tb_name']
        Logger.info('Thread: ' + str(threadPoolNum) + ', run: ' + str(wNum) + ' -> ' + str(sourceTb))
        # Run the extraction task
        self.extractMysqlTable(qTableInfo)
        #hiveTbName = "test." + qTableInfo['db_name'] + "__" + qTableInfo['tb_name']
        #print hiveTbName
        #print self.hiveModel.dropTable(hiveTbName)
        q.task_done()
        wNum += 1
def gatherTable(self, dbName, tbName, date):
    try:
        extractConf = self.getRegisterInstance('confModel').getExtractConf()
        confSeparator = extractConf['core']['separator']
        confSourceDb = extractConf['gather_table']['hive_source_db']
        confTargetDb = extractConf['gather_table']['hive_target_db']
        # Gather table naming rules
        sourceDb = confSourceDb
        sourceTable = dbName + confSeparator + tbName
        targetDb = confTargetDb
        targetTable = dbName + confSeparator + tbName
        gatherTable = GatherTable()
        gatherTable.setSourceDb(sourceDb)
        gatherTable.setSourceTable(sourceTable)
        gatherTable.setTargetDb(targetDb)
        gatherTable.setTargetTable(targetTable)
        gatherTable.setPartitionDate(date)
        gatherTable.run()
    except Exception, ex:
        log = "Exception-> " + str(sourceDb) + "." + str(sourceTable)
        log += " -> " + str(Exception) + ":" + str(ex)
        Logger.info(log)
def process(self):
    Logger.init()
    pars = self.getFormatParameter()
    # Date
    parsDate = pars.get('date')
    if (parsDate != None):
        if (parsDate == "today"):
            date = self.getRegisterInstance('dateModel').getToday()
        elif (parsDate == "tomorrow"):
            date = self.getRegisterInstance('dateModel').getTomorrow()
        elif (parsDate == "yesterday"):
            date = self.getRegisterInstance('dateModel').getYesterday()
        else:
            date = parsDate
    else:
        # Default is yesterday's date, format: 20151010
        date = self.getRegisterInstance('dateModel').getYesterday()
    print date
    self.schedulerJob(date)
    self.minireportJob(date)
    self.hiveMetadata(date)
    self.dataWarehouse(date)
    self.hdfsMonitor(date)
def extractMysqlTableThread(self):
    # Tables waiting to be processed
    sourceTableList = self.getRegisterInstance('biDbModel').getExtractMysqlTables(ExtractMysql.COMPLETE)
    #sourceTableList = self.getRegisterInstance('biDbModel').getExtractMysqlTables(ExtractMysql.INCREMENTAL)
    # Number of worker threads
    numThreadPool = 4
    # Work queue
    q = Queue.Queue()
    # Enqueue every table
    for curTableInfo in sourceTableList:
        q.put(curTableInfo)
    # Start the requested number of consumer threads
    for curThreadPoolNum in range(numThreadPool):
        currentThread = threading.Thread(target=self.runTable, args=(q, curThreadPoolNum))
        # True: the main thread does not wait for this thread to finish before exiting
        # False: the main thread waits for all threads to finish before exiting
        currentThread.setDaemon(True)
        currentThread.start()
    # Block until the queue is empty, then continue
    q.join()
    Logger.info('Finished~')
    return True
def extractMysqlTableThread(self):
    # Tables waiting to be processed
    sourceTableList = self.getRegisterInstance('biDbModel').getExtractMysqlTables()
    # Number of worker threads
    numThreadPool = 2
    # Work queue
    q = Queue.Queue()
    # Enqueue every table
    for curTableInfo in sourceTableList:
        q.put(curTableInfo)
    # Start the requested number of consumer threads
    for curThreadPoolNum in range(numThreadPool):
        currentThread = threading.Thread(target=self.runTable, args=(q, curThreadPoolNum))
        # The main thread does not wait for this thread to finish before exiting
        currentThread.setDaemon(True)
        currentThread.start()
        sleep(5)
    # Block until the queue is empty, then continue
    q.join()
    Logger.info('Finished~')
def init(self):
    DwServiceCore.init(self)
    Logger.init()
    self.ubaSqlPath = self.getDwCoreInstance().SystemPath('basePath') + '/uba_log/uba_sql'
def runDwSqlProcess(self, parsMap):
    status = False
    try:
        Logger.init()
        # SQL file
        sqlFile = parsMap.get('sql')
        if (sqlFile == None):
            Logger.info("SQL warehouse file does not exist")
            exit(1)
        # Date
        parsDate = parsMap.get('date')
        if (parsDate != None):
            if (parsDate == "today"):
                date = self.getRegisterInstance('dateModel').getToday()
            elif (parsDate == "tomorrow"):
                date = self.getRegisterInstance('dateModel').getTomorrow()
            elif (parsDate == "yesterday"):
                date = self.getRegisterInstance('dateModel').getYesterday()
            else:
                date = parsDate
        else:
            # Default is yesterday's date, format: 20151010
            date = self.getRegisterInstance('dateModel').getYesterday()
        # Server type
        serverType = parsMap.get('serverType')
        if (serverType == None):
            Logger.info("serverType : hive or spark")
            exit(1)
        # Whether the SQL file belongs to the dw warehouse
        isDwSql = parsMap.get("isDwSql")
        # Read the SQL file content and substitute the date
        if (isDwSql == None):
            sqlContent = self.getDwSqlContent(parsMap.get('sql'), date)
        elif (isDwSql == "yes"):
            sqlContent = self.getDwSqlContent(parsMap.get('sql'), date)
        elif (isDwSql == "no"):
            sqlContent = self.getSqlContent(parsMap.get('sql'), date)
        else:
            Logger.info("isDwSql parameter: [yes|no]")
            exit(1)
        if (serverType == 'hive'):
            status = self.runSqlByHive(sqlContent, parsMap.get('runEnv'))
        elif (serverType == 'spark'):
            status = self.runSqlBySpark(sqlContent)
    except Exception, ex:
        log = "Stored procedure exception: "
        log += " -> " + str(Exception) + ":" + str(ex)
        Logger.info(log)
def initGatherTable(self):
    # Start time
    startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()
    # 1. Create a gather table based on the source table
    # Get the source table fields
    sourceTableFields = self.getRegisterInstance('hiveModel').getFileds(
        self.getSourceDb(), self.getSourceTable())
    formatTableFieldsList = []
    for curField in sourceTableFields:
        formatTableFieldsList.append('`' + curField + '`')
    formatTableFieldsStr = ' String,'.join(formatTableFieldsList) + " String"
    createHiveTableSql = '''
        CREATE TABLE IF NOT EXISTS %s.%s (
            %s
        )
        PARTITIONED BY (`p_dt` String)
        STORED AS ORC
    ''' % (self.getTargetDb(), self.getTargetTable(), formatTableFieldsStr)
    # Create the table
    createHiveTableResult = self.getRegisterInstance('hiveModel').createTable(createHiveTableSql)
    # 2. Load data into the gather table
    insertSql = '''
        INSERT OVERWRITE TABLE `%(gatherTable)s` PARTITION (`p_dt` = '%(partitionDate)s')
        SELECT * FROM %(sourceTable)s;
    ''' % {
        'gatherTable': self.getTargetDb() + '.' + self.getTargetTable(),
        'partitionDate': self.getPartitionDate(),
        'sourceTable': self.getSourceDb() + '.' + self.getSourceTable()
    }
    # Run the insert
    insertResult = self.getRegisterInstance('hiveModel').batchExecuteSql(insertSql)
    # 3. Check the results
    if (createHiveTableResult == True and insertResult == True):
        resultCode = 0
    else:
        resultCode = 1
    # 4. Log the result to MySQL and print it
    # Elapsed time
    diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp
    # Write the MySQL log
    self.extractLog(resultCode, diffTimestamp)
    # Print the log
    logStr = ("(Initialize : " + str(self.getSourceDb()) + "." + str(self.getSourceTable())
              + " -> " + str(self.getTargetDb()) + "." + str(self.getTargetTable())
              + " Time : " + str(diffTimestamp) + ")")
    Logger.info(logStr)
def shutdown(self): Logger.info("执行结果 access_log : " + str(self.accessLogStatus)) Logger.info("执行结果 dw_access_log : " + str(self.dwAccessLogStatus)) Logger.info("执行结果 uba_app_action_log : " + str(self.ubaAppActionLogStatus)) Logger.info("执行结果 uba_web_visit_log : " + str(self.ubaWebVisitLogStatus)) Logger.info("执行结果 uba_web_action_log : " + str(self.ubaWebActionLogStatus))
def sourceTableToGatherTable(self):
    # Start time
    startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()
    # 1. Get both table structures
    sourceTableFields = self.getRegisterInstance('hiveModel').getFileds(
        self.getSourceDb(), self.getSourceTable())
    gatherTableFields = self.getRegisterInstance('hiveModel').getFileds(
        self.getTargetDb(), self.getTargetTable())
    # 2. Build the field list to load into the gather table
    fieldSql = ''
    for curGatherField in gatherTableFields:
        if (curGatherField == 'p_dt'):
            continue
        if (curGatherField in sourceTableFields):
            fieldSql += '`' + curGatherField + '`,'
        else:
            fieldSql += "'' AS " + '`' + curGatherField + '`,'
    # Strip the trailing comma
    formatFieldSql = fieldSql[:-1]
    # 3. Assemble the SQL
    gatherTableSql = '''
        INSERT OVERWRITE TABLE `%(gatherTable)s` PARTITION (`p_dt` = '%(partitionDate)s')
        SELECT %(fieldSql)s FROM %(sourceTable)s;
    ''' % {
        'gatherTable': self.getTargetDb() + '.' + self.getTargetTable(),
        'partitionDate': self.getPartitionDate(),
        'fieldSql': formatFieldSql,
        'sourceTable': self.getSourceDb() + '.' + self.getSourceTable()
    }
    # Run the SQL
    gatherTableResult = self.getRegisterInstance('hiveModel').batchExecuteSql(gatherTableSql)
    # 4. Check the result
    if (gatherTableResult == True):
        resultCode = 0
    else:
        resultCode = 1
    # 5. Log the result to MySQL and print it
    # Elapsed time
    diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp
    # Write the MySQL log
    self.extractLog(resultCode, diffTimestamp)
    # Print the log
    logStr = ("(Aggregate : " + str(self.getSourceDb()) + "." + str(self.getSourceTable())
              + " -> " + str(self.getTargetDb()) + "." + str(self.getTargetTable())
              + " Time : " + str(diffTimestamp) + ")")
    Logger.info(logStr)
def run(self):
    Logger.init()
    # Extraction type (complete or incremental)
    curExtractType = self.getExtractType()
    # Complete extraction
    if (curExtractType == ExtractMysql.COMPLETE):
        self.extractCompleteAction()
    # Incremental extraction
    elif (curExtractType == ExtractMysql.INCREMENTAL):
        self.extractIncrementalAction()
def run(self):
    Logger.init()
    # The target gather table does not exist yet
    if (self.isExistsGatherTable() == False):
        # Initialize the gather table
        self.initGatherTable()
    else:
        sourceTableNewFields = self.getSourceTableNewFields()
        # The source table has new fields
        if (len(sourceTableNewFields) > 0):
            # Add the new fields
            self.alterGatherTableField(sourceTableNewFields)
            # Load
            self.sourceTableToGatherTable()
        else:
            # Load
            self.sourceTableToGatherTable()
def hdfsMonitor(self, date):
    rs = self.getHdfsMB()
    self.modifyIndicatorSystem(date, 'hadoop_data', rs.get('dataMb'))
    self.modifyIndicatorSystem(date, 'hadoop_hdfs', rs.get('hdfsMb'))
    # Print the log
    Logger.info("------------------------------")
    Logger.info("hadoop_data: " + str(rs.get('dataMb')))
    Logger.info("hadoop_hdfs: " + str(rs.get('hdfsMb')))
    Logger.info("------------------------------")
def process(self):
    Logger.init()
    # Parse parameters
    parameter = self.getFormatParameter()
    dbName = parameter.get('dbName')
    tbName = parameter.get('tbName')
    date = parameter.get('date')
    if (date == None):
        date = self.getRegisterInstance('dateModel').getYesterdayByYmd()
    # Snapshot a specific table
    if (dbName != None and tbName != None):
        self.snapshotTable(dbName, tbName, date)
    # Snapshot all tables
    else:
        self.snapshotTableAll(date)
def runTable(self, q, threadPoolNum):
    wNum = 1
    while (True):
        # Exit when the queue is empty
        if (q.empty() == True):
            break
        # Next item to consume from the queue
        qTableInfo = q.get()
        sourceTb = qTableInfo['db_name'] + '.' + qTableInfo['tb_name'] + '.' + str(qTableInfo['id'])
        Logger.info('Thread: ' + str(threadPoolNum) + ', run: ' + str(wNum) + '. ' + str(sourceTb))
        # Run the extraction task
        self.extractMysqlTable(qTableInfo)
        q.task_done()
        wNum += 1
def extractIncrementalAction(self):
    # Check whether the Hive table exists
    if (self.isExistsHiveTable() == False):
        Logger.info("Incremental extraction control: initializing, running complete extraction... -> "
                    + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()))
        # Table does not exist: run a complete extraction
        self.extractCompleteAction()
    else:
        # Check for field changes
        # Structure changed
        if (self.checkStbAndTtbFields() == True):
            Logger.info("Incremental extraction control: structure changed, reinitializing with complete extraction... -> "
                        + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()))
            # Complete extraction
            self.extractCompleteAction()
        # No change
        else:
            Logger.info("Incremental extraction control: running incremental extraction... -> "
                        + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()))
            # Incremental extraction
            self.extractIncrementalTable()
def snapshotTable(self, dbName, tbName, date):
    try:
        extractConf = self.getRegisterInstance('confModel').getExtractConf()
        confSeparator = extractConf['core']['separator']
        confSourceDb = extractConf['snapshot_table']['hive_source_db']
        confTargetDb = extractConf['snapshot_table']['hive_target_db']
        # Snapshot table naming rules
        sourceDb = confSourceDb
        sourceTable = dbName + confSeparator + tbName
        targetDb = confTargetDb
        targetTable = dbName + confSeparator + tbName + '_' + date.replace('-', '')
        # Start time
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()
        # 1. Build the SQL
        createHiveTableSql = '''
            DROP TABLE IF EXISTS %(snapshotTbl)s;
            CREATE TABLE IF NOT EXISTS %(snapshotTbl)s AS SELECT * FROM %(srcTbl)s;
        ''' % {
            'srcTbl': sourceDb + '.' + sourceTable,
            'snapshotTbl': targetDb + '.' + targetTable
        }
        result = self.getRegisterInstance('hiveModel').batchExecuteSql(createHiveTableSql)
        # 2. Log the result to MySQL and print it
        diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp
        logSql = "INSERT INTO dw_service.snapshot_log (`source_db`,`source_table`,`target_db`,`target_table`,`code`,`run_time`) "
        logSql += "VALUES ('%s','%s','%s','%s',%d,%d)"
        logSql = logSql % (sourceDb, sourceTable, targetDb, targetTable, int(result), diffTimestamp)
        self.getRegisterInstance('biDbModel').insertData(logSql)
        # Print the log
        logStr = ("(" + str(sourceDb) + "." + str(sourceTable) + " -> " + str(targetDb) + "." + str(targetTable)
                  + " Time : " + str(diffTimestamp) + ")")
        Logger.info(logStr)
    except Exception, ex:
        log = "Exception-> " + str(sourceDb) + "." + str(sourceTable)
        log += " -> " + str(Exception) + ":" + str(ex)
        Logger.info(log)
def process(self):
    Logger.init()
    # Parse parameters
    parameter = self.getFormatParameter()
    # Run type
    runType = parameter.get('runType')
    # Serial extraction
    if (runType == 'liste'):
        self.extractMysqlTableListe()
    # Parallel extraction
    elif (runType == 'thread'):
        self.extractMysqlTableThread()
    # Extraction of a specific table
    elif (parameter.get('sourceDb') != None and parameter.get('sourceTb') != None):
        self.extractMysqlTableIndependent(parameter)
    # Test
    else:
        self.extractMysqlTableTest()
def extractMysqlSqoop(self):
    Logger.info("---------- sqoop start " + str(self.getTbId()) + ': '
                + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " ----------")
    # Start time
    startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()
    # 1. Drop the target Hive table
    Logger.info("Drop the target Hive table")
    self.getRegisterInstance('hiveModel').dropTable(self.getTargetDb() + "." + self.getTargetTable())
    # 2. Run the Sqoop import from MySQL into Hive
    Logger.info("Run the Sqoop import from MySQL into Hive")
    self.getRegisterInstance('sqoopModel').setDbServer(self.getExtractDb())
    self.getRegisterInstance('sqoopModel').setSourceDb(self.getSourceDb())
    self.getRegisterInstance('sqoopModel').setSourceTable(self.getSourceTable())
    self.getRegisterInstance('sqoopModel').setTargetDb(self.getTargetDb())
    self.getRegisterInstance('sqoopModel').setTargetTable(self.getTargetTable())
    self.getRegisterInstance('sqoopModel').setMapReduceNum(self.getMapReduceNum())
    result = self.getRegisterInstance('sqoopModel').importMysqlToHive()
    # 3. Check the result
    if (result['code'] == 0):
        resultCode = 0
    else:
        resultCode = result['code']
    # 4. Log the result to MySQL and print it
    # Elapsed time
    diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp
    # Write the MySQL log
    self.extractLog(resultCode, diffTimestamp)
    # Print the log
    logStr = ("Complete extraction : (Sqoop : " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "."
              + str(self.getSourceTable()) + " -> " + str(self.getTargetDb()) + "."
              + str(self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")")
    Logger.info(logStr)
def runSqlBySpark(self, sqlContent):
    Logger.info(sqlContent)
    Logger.info("Running.....")
    # Submit to Spark over JDBC
    status = self.getRegisterInstance('sparkModel').batchExecuteSql(sqlContent)
    # Print the log
    Logger.info(self.runLog("Run result", status))
    return status
def schedulerJob(self, date):
    sc = SchedulerJob()
    # Run the stored procedure
    sc.storedProcedure(date)
    # Total number of jobs
    jobCount = sc.getJobCount(date)
    etlJobCnStatus = self.modifyIndicatorSystem(date, 'etl_job_cn', jobCount)
    # Average job run time
    jobAvg = sc.getJobTotalAvgTime(date)
    etlJobRunAvgStatus = self.modifyIndicatorSystem(date, 'etl_job_run_avg', jobAvg)
    # Print the log
    Logger.info("------------------------------")
    Logger.info("etl_job_cn: " + str(jobCount) + " ," + str(etlJobCnStatus))
    Logger.info("etl_job_run_avg: " + str(jobAvg) + " ," + str(etlJobRunAvgStatus))
    Logger.info("------------------------------")
def hiveMetadata(self, date):
    hm = HiveMetadata()
    # Run the stored procedure
    hm.storedProcedure(date)
    # Total number of Hive tables
    hive_table_cn = hm.getHiveTableCount(date)
    hiveTableCnStatus = self.modifyIndicatorSystem(date, 'hive_table_cn', hive_table_cn)
    # Print the log
    Logger.info("------------------------------")
    Logger.info("hive_table_cn: " + str(hive_table_cn) + " ," + str(hiveTableCnStatus))
    Logger.info("------------------------------")
def minireportJob(self, date):
    mr = MinireportJob()
    # Run the stored procedure
    mr.storedProcedure(date)
    # Total number of minireports
    minireportCount = mr.getMinireportCount(date)
    minireportCnStatus = self.modifyIndicatorSystem(date, 'minireport_cn', minireportCount)
    # Average minireport run time
    minireportAvg = mr.getMinireportTotalAvgTime(date)
    minireportRunAvgStatus = self.modifyIndicatorSystem(date, 'minireport_run_avg', minireportAvg)
    # Print the log
    Logger.info("------------------------------")
    Logger.info("minireport_cn: " + str(minireportCount) + " ," + str(minireportCnStatus))
    Logger.info("minireport_run_avg: " + str(minireportAvg) + " ," + str(minireportRunAvgStatus))
    Logger.info("------------------------------")
def runSqlByHive(self, sqlContent, runEnv):
    Logger.info(sqlContent)
    Logger.info("Running.....")
    status = False
    # Run environment
    if (runEnv == "local"):
        # Submit to the local Hive client
        status = self.getRegisterInstance('hiveModel').runHiveScript(sqlContent)
    # Submit to Hive over JDBC (HiveServer2)
    elif (runEnv == "hiveserver2"):
        status = self.getRegisterInstance('hiveModel').batchExecuteSql(sqlContent)
    else:
        status = self.getRegisterInstance('hiveModel').batchExecuteSql(sqlContent)
    # Print the log
    Logger.info(self.runLog("Run result", status))
    return status
def extractIncrementalTable(self):
    Logger.info("---------- Incremental extraction start " + str(self.getTbId()) + ': '
                + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " ----------")
    # Start time
    startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()
    # Set the incremental table attributes
    Logger.info("Set the incremental table attributes")
    tableInfoExt = self.getRegisterInstance('biDbModel').getExtractMysqlTableExt(self.getTbId())
    self.setIncrementalAttribute(tableInfoExt)
    # Get the incremental table attributes
    Logger.info("Get the incremental table attributes")
    incTbAttr = self.getIncrementalAttribute()
    primaryKey = incTbAttr['primary_key']
    incrementalField = incTbAttr['incremental_field']
    incrementalVal = incTbAttr['incremental_val']
    conditions = incTbAttr['conditions']
    # Hive target table
    targetTb = self.getTargetDb() + "." + self.getTargetTable()
    # Hive incremental table
    incTb = targetTb + "__inc"
    # 1. Drop the incremental extraction table
    Logger.info("Drop the incremental extraction table")
    self.getRegisterInstance('hiveModel').dropTable(incTb)
    # 2. Create the incremental extraction table
    Logger.info("Create the incremental extraction table")
    createHiveTableSql = "CREATE TABLE " + incTb + " LIKE " + targetTb
    createHiveTableResult = self.getRegisterInstance('hiveModel').createTable(createHiveTableSql)
    incDumpSql = ""
    # 3. Dump the latest incremental data to a local file
    if (incrementalVal == ""):
        # Use the maximum value of the incremental field in the target table as the extraction baseline
        Logger.info("Use the maximum value of the incremental field in the target table as the extraction baseline")
        targetTbMaxPointVal = self.getHiveTbMaxVal(targetTb, incrementalField)
        incDumpSql = ("SELECT * FROM " + self.getSourceDb() + "." + self.getSourceTable()
                      + " WHERE " + incrementalField + conditions + "'" + str(targetTbMaxPointVal) + "'")
        # Update the checkpoint
        Logger.info("Update the checkpoint")
        self.updateTableExt(tableInfoExt['id'], targetTbMaxPointVal)
    else:
        incDumpSql = ("SELECT * FROM " + self.getSourceDb() + "." + self.getSourceTable()
                      + " WHERE " + incrementalField + conditions + "'" + incrementalVal + "'")
    Logger.info("Dump the incremental data to a local file")
    dumpIncFile = self.getDumpFileDir() + "/" + self.getDumpFileName() + ".inc"
    dumpIncResult = self.extractDbServerModel.mysqlDumpFile(incDumpSql, dumpIncFile)
    # 4. Load the dump file into the incremental Hive table
    Logger.info("Load the dump file into the incremental Hive table")
    hiveLoadSql = "LOAD DATA LOCAL INPATH '" + dumpIncFile + "' OVERWRITE INTO TABLE " + incTb + ";"
    hiveLoadResult = self.getRegisterInstance('hiveModel').runHiveScript(hiveLoadSql)
    # Get the maximum incremental field value in this batch
    Logger.info("Get the maximum incremental field value in this batch")
    incTbMaxPointVal = self.getHiveTbMaxVal(incTb, incrementalField)
    # When the extracted incremental data is empty, do nothing and exit
    if (incTbMaxPointVal == None):
        Logger.info("The incremental data is empty...")
        return
    # 5. Merge the increment into the target table
    incHiveSql = """
        INSERT OVERWRITE TABLE %(targetTb)s
        SELECT * FROM (
            SELECT a.*
            FROM %(targetTb)s AS a
            LEFT JOIN %(incTb)s AS b ON a.%(primaryKey)s = b.%(primaryKey)s
            WHERE b.%(primaryKey)s IS NULL
        ) AS bs
        UNION ALL
        SELECT * FROM %(incTb)s;
    """ % {
        'targetTb': targetTb,
        'incTb': incTb,
        'primaryKey': primaryKey
    }
    Logger.info(incHiveSql)
    # 6. Run the final merge with Spark SQL
    Logger.info("Run the final merge with Spark SQL")
    incSqlResult = self.getRegisterInstance('sparkModel').batchExecuteSql(incHiveSql)
    incSqlResult = True
    # 7. Check the results
    if (dumpIncResult['code'] == 0 and hiveLoadResult['code'] == 0
            and createHiveTableResult == True and incSqlResult == True):
        # Update the checkpoint
        if (incTbMaxPointVal != None):
            self.updateTableExt(tableInfoExt['id'], incTbMaxPointVal)
        resultCode = 0
    else:
        resultCode = 1
    # Log the result to MySQL and print it
    # Elapsed time
    diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp
    # Write the MySQL log
    self.extractLog(resultCode, diffTimestamp)
    # Print the log
    logStr = ("Incremental extraction : (Dump : " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "."
              + str(self.getSourceTable()) + " -> " + str(self.getTargetDb()) + "."
              + str(self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")")
    Logger.info(logStr)
def extractMysqlTable(self, tableInfo):
    try:
        extractConf = self.getRegisterInstance('confModel').getExtractConf()
        # Default table separator
        confSeparator = extractConf['core']['separator']
        # Target Hive database for the extraction
        confTargetDb = extractConf['extract_mysql']['hive_target_db']
        # Local temporary directory for dump files
        confDumpFileDir = extractConf['extract_mysql']['dump_file_dir']
        # Table configuration from the database
        # Table id
        tbId = tableInfo['id']
        # MySQL source server
        dbServer = tableInfo['db_server']
        # Source database name
        dbName = tableInfo['db_name']
        # Source table name
        tbName = tableInfo['tb_name']
        # Target Hive database name
        dbTargetDbName = tableInfo['target_db_name']
        # Target Hive table name
        dbTargetTbName = tableInfo['target_tb_name']
        # Extraction tool
        extractTool = tableInfo['extract_tool']
        # Extraction type
        extractType = tableInfo['extract_type']
        # Hive table naming rules
        # When a target database and table are specified, use them
        if (dbTargetDbName != "" and dbTargetTbName != ""):
            affirmTargetDb = dbTargetDbName
            affirmTargetTb = dbTargetTbName
        # Otherwise fall back to the default rule
        else:
            affirmTargetDb = confTargetDb
            affirmTargetTb = dbName + confSeparator + tbName
        # Instantiate the extraction object
        extractMysql = ExtractMysql()
        # Directory where dump files are stored
        extractMysql.setDumpFileDir(confDumpFileDir)
        # Set the source database server
        if (dbServer == ExtractMysql.PRODUCE_DB):
            extractMysql.setExtractDb(ExtractMysql.PRODUCE_DB)
        elif (dbServer == ExtractMysql.DW_DB):
            extractMysql.setExtractDb(ExtractMysql.DW_DB)
        else:
            Logger.info("The extraction source does not exist! " + dbServer)
        # Set the extraction type
        # Complete extraction
        if (extractType == ExtractMysql.COMPLETE):
            extractMysql.setExtractType(ExtractMysql.COMPLETE)
        # Incremental extraction
        elif (extractType == ExtractMysql.INCREMENTAL):
            extractMysql.setExtractType(ExtractMysql.INCREMENTAL)
        else:
            Logger.info("The extraction type does not exist! " + extractType)
        # Configure the extraction tool
        if (extractTool == ExtractMysql.MYSQL_DUMP):
            extractMysql.setExtractTool(ExtractMysql.MYSQL_DUMP)
        elif (extractTool == ExtractMysql.SQOOP):
            extractMysql.setExtractTool(ExtractMysql.SQOOP)
            extractMysql.setMapReduceNum(5)
        # Set the table information
        sourceDb = dbName
        sourceTable = tbName
        targetDb = affirmTargetDb
        targetTable = affirmTargetTb
        extractMysql.setTbId(tbId)
        extractMysql.setSourceDb(sourceDb)
        extractMysql.setSourceTable(sourceTable)
        extractMysql.setTargetDb(targetDb)
        extractMysql.setTargetTable(targetTable)
        extractMysql.run()
    except Exception, ex:
        log = "Exception-> table: " + str(dbServer) + ": " + str(dbName) + "." + str(tbName)
        log += " -> " + str(Exception) + ":" + str(ex)
        Logger.info(log)
def extractMysqlDump(self):
    Logger.info("---------- mysqlDump start " + str(self.getTbId()) + ': '
                + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " ----------")
    # Start time
    startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()
    # 1. Drop the target Hive table
    Logger.info("Drop the target Hive table")
    self.getRegisterInstance('hiveModel').dropTable(self.getTargetDb() + "." + self.getTargetTable())
    # 2. Dump the MySQL data to a local file
    Logger.info("Dump the MySQL data to a local file")
    dumpSql = "SELECT * FROM " + self.getSourceDb() + "." + self.getSourceTable()
    dumpFile = self.getDumpFileDir() + "/" + self.getDumpFileName()
    dumpResult = self.extractDbServerModel.mysqlDumpFile(dumpSql, dumpFile)
    # 3. Create the Hive table from the MySQL table structure
    Logger.info("Create the Hive table from the MySQL table structure")
    sourceTableFields = self.getSourceTableFields()
    formatTableFieldsList = []
    for curField in sourceTableFields:
        formatTableFieldsList.append('`' + curField + '`')
    formatTableFieldsStr = ' String,'.join(formatTableFieldsList) + " String"
    createHiveTableSql = '''
        CREATE TABLE IF NOT EXISTS %s.%s (
            %s
        )
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\001'
        COLLECTION ITEMS TERMINATED BY '\\n'
        STORED AS TEXTFILE
    ''' % (self.getTargetDb(), self.getTargetTable(), formatTableFieldsStr)
    # Create the Hive table
    Logger.info("Create the Hive table")
    createHiveTableResult = self.getRegisterInstance('hiveModel').createTable(createHiveTableSql)
    # 4. Load the dump file into the Hive table
    Logger.info("Load the dump file into the Hive table")
    hiveLoadSql = ("LOAD DATA LOCAL INPATH '" + self.getDumpFileDir() + "/" + self.getDumpFileName()
                   + "' OVERWRITE INTO TABLE " + self.getTargetDb() + "." + self.getTargetTable() + ";")
    hiveLoadResult = self.getRegisterInstance('hiveModel').runHiveScript(hiveLoadSql)
    # 5. Check the results
    if (dumpResult['code'] == 0 and createHiveTableResult == True and hiveLoadResult['code'] == 0):
        resultCode = 0
    else:
        resultCode = hiveLoadResult['code']
    # 6. Log the result to MySQL and print it
    # Elapsed time
    diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp
    # Write the MySQL log
    self.extractLog(resultCode, diffTimestamp)
    # Print the log
    logStr = ("Complete extraction : (Dump : " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "."
              + str(self.getSourceTable()) + " -> " + str(self.getTargetDb()) + "."
              + str(self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")")
    Logger.info(logStr)
def dataWarehouse(self, date):
    # Use yesterday's date, because the statistics always cover the previous day
    offsetDay = self.getRegisterInstance('dateModel').getOffsetDateDay(date, -1)
    # dw_web_visit_traffic_log row count
    dwWebVisitTrafficLogCn = self.getTableCountForDate('dw_db.dw_web_visit_traffic_log', offsetDay)
    dwWebVisitTrafficLogCnStatus = self.modifyIndicatorSystem(date, 'dw_web_visit_traffic_log', dwWebVisitTrafficLogCn)
    # dw_web_action_detail_log row count
    dwWebActionDetailLogCn = self.getTableCountForDate('dw_db.dw_web_action_detail_log', offsetDay)
    dwWebActionDetailLogCnStatus = self.modifyIndicatorSystem(date, 'dw_web_action_detail_log', dwWebActionDetailLogCn)
    # dw_app_access_log row count
    dwAppAccessLogCn = self.getTableCountForDate('dw_db.dw_app_access_log', offsetDay)
    dwAppAccessLogCnStatus = self.modifyIndicatorSystem(date, 'dw_app_access_log', dwAppAccessLogCn)
    # dw_app_action_detail_log row count
    dwAppActionDetailLogCn = self.getTableCountForDate('dw_db.dw_app_action_detail_log', offsetDay)
    dwAppActionDetailLogCnStatus = self.modifyIndicatorSystem(date, 'dw_app_action_detail_log', dwAppActionDetailLogCn)
    # dw_property_inventory_sd row count
    dwPropertyInventorySdCn = self.getTableCountForDate('dw_db.dw_property_inventory_sd', offsetDay)
    dwPropertyInventorySdCnStatus = self.modifyIndicatorSystem(date, 'dw_property_inventory_sd', dwPropertyInventorySdCn)
    # Print the log
    Logger.info("------------------------------")
    Logger.info(str(date) + " " + str(offsetDay))
    Logger.info("dw_web_visit_traffic_log: " + str(dwWebVisitTrafficLogCn) + " ," + str(dwWebVisitTrafficLogCnStatus))
    Logger.info("dw_web_action_detail_log: " + str(dwWebActionDetailLogCn) + " ," + str(dwWebActionDetailLogCnStatus))
    Logger.info("dw_app_access_log: " + str(dwAppAccessLogCn) + " ," + str(dwAppAccessLogCnStatus))
    Logger.info("dw_app_action_detail_log: " + str(dwAppActionDetailLogCn) + " ," + str(dwAppActionDetailLogCnStatus))
    Logger.info("dw_property_inventory_sd: " + str(dwPropertyInventorySdCn) + " ," + str(dwPropertyInventorySdCnStatus))
    Logger.info("------------------------------")
def shutdown(self): Logger.info("执行结果 dw_app_access_log : " + str(self.dwAppAccessLogStatus)) Logger.info("执行结果 dw_app_action_detail_log : " + str(self.dwAppActionDetailLogStatus)) Logger.info("执行结果 dw_web_visit_traffic_log : " + str(self.dwWebVisitTrafficLogStatus)) Logger.info("执行结果 dw_web_action_detail_log : " + str(self.dwWebActionDetailLogStatus))