Example No. 1
    def extractMysqlTableThread(self):
        # Tables waiting to be processed
        sourceTableList = self.getRegisterInstance(
            'biDbModel').getExtractMysqlTables()

        # Number of worker threads
        numThreadPool = 2

        # Define the queue
        q = Queue.Queue()

        # Enqueue each table
        for curTableInfo in sourceTableList:
            q.put(curTableInfo)

        # Start the given number of consumer threads
        for curThreadPoolNum in range(numThreadPool):
            currentThread = threading.Thread(target=self.runTable,
                                             args=(q, curThreadPoolNum))
            # Daemon thread: the parent does not wait for it to finish
            currentThread.setDaemon(True)
            currentThread.start()
            sleep(5)

        # Block until the queue is drained, then continue
        q.join()

        Logger.info('Execution complete~')
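
For reference, a minimal self-contained sketch of the same daemon-thread/queue pattern (Python 2, matching these examples; the table names and worker body are made up):

import Queue
import threading

def worker(q, workerId):
    while not q.empty():
        item = q.get()
        print 'worker %d processing %s' % (workerId, item)
        q.task_done()

q = Queue.Queue()
for item in ['db1.tb1', 'db1.tb2', 'db2.tb1']:
    q.put(item)

for workerId in range(2):
    t = threading.Thread(target=worker, args=(q, workerId))
    t.setDaemon(True)  # daemon: does not block interpreter exit
    t.start()

q.join()  # returns once task_done() has been called for every put()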
Example No. 2
    def runTable(self, q, threadPoolNum):

        wNum = 1
        while (True):
            # Exit when the queue is empty
            if (q.empty() == True):
                break

            # Next item from the queue
            qTableInfo = q.get()

            sourceTb = str(
                qTableInfo['id']
            ) + ': ' + qTableInfo['db_name'] + '.' + qTableInfo['tb_name']
            Logger.info('Thread: ' + str(threadPoolNum) + ', iteration: ' +
                        str(wNum) + ' -> ' + str(sourceTb))

            # Run the extraction task
            self.extractMysqlTable(qTableInfo)

            #hiveTbName =  "test." + qTableInfo['db_name'] + "__" + qTableInfo['tb_name']
            #print hiveTbName
            #print self.hiveModel.dropTable(hiveTbName)

            q.task_done()
            wNum += 1
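
One caveat in the loop above: between q.empty() and q.get() another consumer thread may drain the queue, leaving get() blocked forever. A sketch of a safer variant using the same Queue module (processTable is a hypothetical stand-in for extractMysqlTable):

import Queue

def processTable(tableInfo):
    # stand-in for the real extraction call
    print 'processing %s' % (tableInfo,)

def runTableSafe(q, threadPoolNum):
    while True:
        try:
            tableInfo = q.get(block=False)
        except Queue.Empty:
            break  # queue fully drained; exit this worker
        try:
            processTable(tableInfo)
        finally:
            q.task_done()  # always mark the item done, even on error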
Example No. 3
    def gatherTable(self, dbName, tbName, date):
        try:
            extractConf = self.getRegisterInstance(
                'confModel').getExtractConf()

            confSeparator = extractConf['core']['separator']
            confSourceDb = extractConf['gather_table']['hive_source_db']
            confTargetDb = extractConf['gather_table']['hive_target_db']

            # Set up the gather table naming rules
            sourceDb = confSourceDb
            sourceTable = dbName + confSeparator + tbName
            targetDb = confTargetDb
            targetTable = dbName + confSeparator + tbName

            gatherTable = GatherTable()
            gatherTable.setSourceDb(sourceDb)
            gatherTable.setSourceTable(sourceTable)
            gatherTable.setTargetDb(targetDb)
            gatherTable.setTargetTable(targetTable)
            gatherTable.setPartitionDate(date)
            gatherTable.run()

        except Exception as ex:
            log = "Exception -> " + str(sourceDb) + "." + str(sourceTable)
            log += " -> " + str(Exception) + ":" + str(ex)
            Logger.info(log)
Example No. 4
    def extractMysqlTableThread(self):

        # Tables waiting to be processed
        sourceTableList = self.getRegisterInstance(
            'biDbModel').getExtractMysqlTables(ExtractMysql.COMPLETE)
        #sourceTableList = self.getRegisterInstance('biDbModel').getExtractMysqlTables(ExtractMysql.INCREMENTAL)

        # Number of worker threads
        numThreadPool = 4

        # Define the queue
        q = Queue.Queue()

        # Enqueue each table
        for curTableInfo in sourceTableList:
            q.put(curTableInfo)

        # Start the given number of consumer threads
        for curThreadPoolNum in range(numThreadPool):
            currentThread = threading.Thread(target=self.runTable,
                                             args=(q, curThreadPoolNum))
            # True: the parent does not wait for this thread to finish
            # False: the parent waits for all threads to finish before exiting
            currentThread.setDaemon(True)
            currentThread.start()

        # Block until the queue is drained, then continue
        q.join()

        Logger.info('Execution complete~')

        return True
Example No. 5
    def initGatherTable(self):
        # Start time
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

        # 1. Create a gather table from the source table
        # Get the source table fields
        sourceTableFields = self.getRegisterInstance('hiveModel').getFileds(
            self.getSourceDb(), self.getSourceTable())
        formatTableFieldsList = []
        for curField in sourceTableFields:
            formatTableFieldsList.append('`' + curField + '`')
        formatTableFieldsStr = ' String,'.join(
            formatTableFieldsList) + " String"

        createHiveTableSql = '''
CREATE TABLE IF NOT EXISTS  %s.%s (
%s
) PARTITIONED BY  (
  `p_dt` String
)
STORED AS ORC
''' % (self.getTargetDb(), self.getTargetTable(), formatTableFieldsStr)

        # Create the table
        createHiveTableResult = self.getRegisterInstance(
            'hiveModel').createTable(createHiveTableSql)

        # 2. Load data into the gather table
        insertSql = '''
INSERT OVERWRITE TABLE `%(gatherTable)s` PARTITION (`p_dt` = '%(partitionDate)s') SELECT * FROM %(sourceTable)s;
''' % {
            'gatherTable': self.getTargetDb() + '.' + self.getTargetTable(),
            'partitionDate': self.getPartitionDate(),
            'sourceTable': self.getSourceDb() + '.' + self.getSourceTable()
        }
        # Run the insert
        insertResult = self.getRegisterInstance('hiveModel').batchExecuteSql(
            insertSql)

        # 3. Check the results
        if (createHiveTableResult == True and insertResult == True):
            resultCode = 0
        else:
            resultCode = 1

        # 4. Compute elapsed time, write the log, and print
        # Elapsed time
        diffTimestamp = self.getRegisterInstance(
            'dateModel').getTimestamp() - startTimestamp

        # Write the log record to MySQL
        self.extractLog(resultCode, diffTimestamp)

        # Print the log
        logStr = "(Initialize : " + str(self.getSourceDb()) + "." + str(
            self.getSourceTable()) + " -> " + str(
                self.getTargetDb()) + "." + str(self.getTargetTable(
                )) + " Time : " + str(diffTimestamp) + ")"
        Logger.info(logStr)
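
A quick sketch of what the field formatting above produces, with hypothetical column names; every column is typed String, and the table is partitioned by `p_dt` and stored as ORC:

sourceTableFields = ['id', 'name', 'created_at']  # hypothetical fields
formatTableFieldsList = ['`' + f + '`' for f in sourceTableFields]
formatTableFieldsStr = ' String,'.join(formatTableFieldsList) + " String"
print formatTableFieldsStr
# -> `id` String,`name` String,`created_at` String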
Example No. 6
 def shutdown(self):
     Logger.info("Result access_log : " + str(self.accessLogStatus))
     Logger.info("Result dw_access_log : " + str(self.dwAccessLogStatus))
     Logger.info("Result uba_app_action_log : " +
                 str(self.ubaAppActionLogStatus))
     Logger.info("Result uba_web_visit_log : " +
                 str(self.ubaWebVisitLogStatus))
     Logger.info("Result uba_web_action_log : " +
                 str(self.ubaWebActionLogStatus))
Example No. 7
    def sourceTableToGatherTable(self):
        # Start time
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

        # 1. Get the table structures
        sourceTableFields = self.getRegisterInstance('hiveModel').getFileds(
            self.getSourceDb(), self.getSourceTable())
        gatherTableFields = self.getRegisterInstance('hiveModel').getFileds(
            self.getTargetDb(), self.getTargetTable())

        # 2. Format the fields to load into the gather table
        fieldSql = ''
        for curGatherField in gatherTableFields:
            if (curGatherField == 'p_dt'): continue

            if (curGatherField in sourceTableFields):
                fieldSql += '`' + curGatherField + '`,'
            else:
                fieldSql += "'' AS " + '`' + curGatherField + '`,'

        # Strip the trailing comma
        formatFieldSql = fieldSql[:-1]

        # 3. Assemble the SQL
        gatherTableSql = '''
INSERT OVERWRITE TABLE `%(gatherTable)s` PARTITION (`p_dt` = '%(partitionDate)s') SELECT %(fieldSql)s FROM %(sourceTable)s;
''' % {
            'gatherTable': self.getTargetDb() + '.' + self.getTargetTable(),
            'partitionDate': self.getPartitionDate(),
            'fieldSql': formatFieldSql,
            'sourceTable': self.getSourceDb() + '.' + self.getSourceTable()
        }
        # Run the SQL
        gatherTableResult = self.getRegisterInstance(
            'hiveModel').batchExecuteSql(gatherTableSql)

        # 4. Check the results
        if (gatherTableResult == True):
            resultCode = 0
        else:
            resultCode = 1

        # 5. Compute elapsed time, write the log, and print
        # Elapsed time
        diffTimestamp = self.getRegisterInstance(
            'dateModel').getTimestamp() - startTimestamp

        # Write the log record to MySQL
        self.extractLog(resultCode, diffTimestamp)

        # Print the log
        logStr = "(Gather run : " + str(self.getSourceDb()) + "." + str(
            self.getSourceTable()) + " -> " + str(
                self.getTargetDb()) + "." + str(self.getTargetTable(
                )) + " Time : " + str(diffTimestamp) + ")"
        Logger.info(logStr)
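
The field-alignment rule above, shown in isolation with hypothetical columns: gather columns present in the source pass through, missing ones become empty-string literals, so the SELECT list always matches the gather schema:

sourceTableFields = ['id', 'name']                 # hypothetical
gatherTableFields = ['id', 'name', 'city', 'p_dt']

fieldSql = ''
for curGatherField in gatherTableFields:
    if curGatherField == 'p_dt':
        continue
    if curGatherField in sourceTableFields:
        fieldSql += '`' + curGatherField + '`,'
    else:
        fieldSql += "'' AS " + '`' + curGatherField + '`,'

print fieldSql[:-1]  # -> `id`,`name`,'' AS `city`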
Example No. 8
    def runDwSqlProcess(self, parsMap):
        status = False

        try:
            Logger.init()

            # SQL warehouse file
            sqlFile = parsMap.get('sql')
            if (sqlFile == None):
                Logger.info("SQL warehouse file does not exist")
                exit(1)

            # Date
            parsDate = parsMap.get('date')

            if (parsDate != None):
                if (parsDate == "today"):
                    date = self.getRegisterInstance('dateModel').getToday()
                elif (parsDate == "tomorrow"):
                    date = self.getRegisterInstance('dateModel').getTomorrow()
                elif (parsDate == "yesterday"):
                    date = self.getRegisterInstance('dateModel').getYesterday()
                else:
                    date = parsDate
            else:
                # Defaults to yesterday's date, format: 20151010
                date = self.getRegisterInstance('dateModel').getYesterday()
    
            # Server type
            serverType = parsMap.get('serverType')
            if (serverType == None):
                Logger.info("serverType : hive or spark")
                exit(1)


            # Whether this is a dw warehouse SQL file
            isDwSql = parsMap.get("isDwSql")
            # Read the SQL file content and substitute in the date
            if (isDwSql == None):
                sqlContent = self.getDwSqlContent(parsMap.get('sql'), date)
            elif (isDwSql == "yes"):
                sqlContent = self.getDwSqlContent(parsMap.get('sql'), date)
            elif (isDwSql == "no"):
                sqlContent = self.getSqlContent(parsMap.get('sql'), date)
            else:
                Logger.info("isDwSql parameter: [yes|no]")
                exit(1)


            if (serverType == 'hive'):
                status = self.runSqlByHive(sqlContent, parsMap.get('runEnv'))
            elif (serverType == 'spark'):
                status = self.runSqlBySpark(sqlContent)


        except Exception as ex:
            log = "Exception in stored procedure: "
            log += " -> " + str(Exception) + ":" + str(ex)
            Logger.info(log)

        return status
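
For reference, a hypothetical parsMap sketching the keys this method reads; the file path and values are made up:

parsMap = {
    'sql': '/data/dw/sql/daily_report.sql',  # required; exits if missing
    'date': 'yesterday',      # today | tomorrow | yesterday | explicit date
    'serverType': 'hive',     # hive | spark; required
    'isDwSql': 'yes',         # yes | no; defaults to the dw-style reader
    'runEnv': 'hiveserver2',  # forwarded to runSqlByHive
}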
Example No. 9
    def hdfsMonitor(self, date):
        rs = self.getHdfsMB()
        self.modifyIndicatorSystem(date, 'hadoop_data', rs.get('dataMb'))
        self.modifyIndicatorSystem(date, 'hadoop_hdfs', rs.get('hdfsMb'))

        # Print the log
        Logger.info("------------------------------")
        Logger.info("hadoop_data: " + str(rs.get('dataMb')))
        Logger.info("hadoop_hdfs: " + str(rs.get('hdfsMb')))
        Logger.info("------------------------------")
Example No. 10
    def runTable(self, q, threadPoolNum):

        wNum = 1
        while (True):
            # Exit when the queue is empty
            if (q.empty() == True):
                break

            # Next item from the queue
            qTableInfo = q.get()

            sourceTb = qTableInfo['db_name'] + '.' + qTableInfo[
                'tb_name'] + '.' + str(qTableInfo['id'])
            Logger.info('Thread: ' + str(threadPoolNum) + ', iteration: ' +
                        str(wNum) + '. ' + str(sourceTb))

            # Run the extraction task
            self.extractMysqlTable(qTableInfo)
            q.task_done()
            wNum += 1
Example No. 11
    def extractIncrementalAction(self):

        # Check whether the Hive table exists
        if (self.isExistsHiveTable() == False):
            Logger.info("Incremental extraction control: initializing, full extraction... -> " +
                        str(self.getTbId()) + ': ' + str(self.getSourceDb()) +
                        "." + str(self.getSourceTable()))
            # Table missing: run a full extraction
            self.extractCompleteAction()
        else:
            # Check for field changes
            # Schema changed
            if (self.checkStbAndTtbFields() == True):
                Logger.info("Incremental extraction control: schema changed, reinitializing, full extraction... -> " +
                            str(self.getTbId()) + ': ' +
                            str(self.getSourceDb()) + "." +
                            str(self.getSourceTable()))
                # Full extraction
                self.extractCompleteAction()
            # No changes
            else:
                Logger.info("Incremental extraction control: incremental extraction... -> " +
                            str(self.getTbId()) + ': ' +
                            str(self.getSourceDb()) + "." +
                            str(self.getSourceTable()))
                # Incremental extraction
                self.extractIncrementalTable()
Example No. 12
    def snapshotTable(self, dbName, tbName, date):
        try:
            extractConf = self.getRegisterInstance('confModel').getExtractConf()

            confSeparator = extractConf['core']['separator']
            confSourceDb = extractConf['snapshot_table']['hive_source_db']
            confTargetDb = extractConf['snapshot_table']['hive_target_db']

            # Set up the snapshot table naming rules
            sourceDb = confSourceDb
            sourceTable = dbName + confSeparator + tbName
            targetDb = confTargetDb
            targetTable = dbName + confSeparator + tbName + '_' + date.replace('-', '')

            # Start time
            startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

            # 1. Build the SQL
            createHiveTableSql = '''
DROP TABLE IF EXISTS %(snapshotTbl)s;
CREATE TABLE IF NOT EXISTS %(snapshotTbl)s AS
SELECT * FROM %(srcTbl)s;''' % {'srcTbl': sourceDb + '.' + sourceTable,
                                'snapshotTbl': targetDb + '.' + targetTable}

            result = self.getRegisterInstance('hiveModel').batchExecuteSql(createHiveTableSql)
            # 2. Compute elapsed time, write the log, and print
            diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp
            logSql = "INSERT INTO dw_service.snapshot_log (`source_db`,`source_table`,`target_db`,`target_table`,`code`,`run_time`) "
            logSql += "VALUES ('%s','%s','%s','%s',%d,%d)"
            logSql = logSql % (sourceDb, sourceTable, targetDb, targetTable, int(result), diffTimestamp)
            self.getRegisterInstance('biDbModel').insertData(logSql)

            # Print the log
            logStr = "(" + str(sourceDb) + "." + str(sourceTable) + " -> " + str(targetDb) + "." + str(targetTable) + " Time : " + str(diffTimestamp) + ")"
            Logger.info(logStr)
             
        except Exception as ex:
            log = "Exception -> " + str(sourceDb) + "." + str(sourceTable)
            log += " -> " + str(Exception) + ":" + str(ex)
            Logger.info(log)
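
A small sketch of the snapshot naming rule above, with hypothetical values (the real separator comes from extractConf['core']['separator']):

dbName, tbName, date = 'demo', 'users', '2015-10-10'  # hypothetical
confSeparator = '__'  # assumed separator value
targetTable = dbName + confSeparator + tbName + '_' + date.replace('-', '')
print targetTable  # -> demo__users_20151010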
Example No. 13
    def extractMysqlSqoop(self):
        Logger.info("---------- sqoop start " + str(self.getTbId()) + ': ' +
                    str(self.getSourceDb()) + "." +
                    str(self.getSourceTable()) + " ----------")

        # Start time
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

        # 1. Drop the corresponding target Hive table
        Logger.info("Dropping the corresponding target Hive table")
        self.getRegisterInstance('hiveModel').dropTable(self.getTargetDb() +
                                                        "." +
                                                        self.getTargetTable())

        # 2. Run Sqoop to import from MySQL into Hive
        Logger.info("Running Sqoop to import from MySQL into Hive")
        self.getRegisterInstance('sqoopModel').setDbServer(self.getExtractDb())
        self.getRegisterInstance('sqoopModel').setSourceDb(self.getSourceDb())
        self.getRegisterInstance('sqoopModel').setSourceTable(
            self.getSourceTable())
        self.getRegisterInstance('sqoopModel').setTargetDb(self.getTargetDb())
        self.getRegisterInstance('sqoopModel').setTargetTable(
            self.getTargetTable())
        self.getRegisterInstance('sqoopModel').setMapReduceNum(
            self.getMapReduceNum())
        result = self.getRegisterInstance('sqoopModel').importMysqlToHive()

        # 3. Check the results
        if (result['code'] == 0):
            resultCode = 0
        else:
            resultCode = result['code']

        # 4. Compute elapsed time, write the log, and print
        # Elapsed time
        diffTimestamp = self.getRegisterInstance(
            'dateModel').getTimestamp() - startTimestamp

        # Write the log record to MySQL
        self.extractLog(resultCode, diffTimestamp)

        # Print the log
        logStr = "Full extraction : (Sqoop : " + str(self.getTbId()) + ': ' + str(
            self.getSourceDb()) + "." + str(self.getSourceTable(
            )) + " -> " + str(self.getTargetDb()) + "." + str(
                self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")"
        Logger.info(logStr)
Example No. 14
    def runSqlBySpark(self, sqlContent):
        Logger.info(sqlContent)
        Logger.info("Running.....")

        # Submit to Spark over JDBC
        status = self.getRegisterInstance('sparkModel').batchExecuteSql(sqlContent)

        # Print the log
        Logger.info(self.runLog("Run result", status))

        return status
Example No. 15
    def schedulerJob(self, date):
        sc = SchedulerJob()

        # Run the stored procedure
        sc.storedProcedure(date)

        # Total number of jobs
        jobCount = sc.getJobCount(date)
        etlJobCnStatus = self.modifyIndicatorSystem(date, 'etl_job_cn',
                                                    jobCount)

        # Overall average job run time
        jobAvg = sc.getJobTotalAvgTime(date)
        etlJobRunAvgStatus = self.modifyIndicatorSystem(
            date, 'etl_job_run_avg', jobAvg)

        # Print the log
        Logger.info("------------------------------")
        Logger.info("etl_job_cn: " + str(jobCount) + " ," +
                    str(etlJobCnStatus))
        Logger.info("etl_job_run_avg: " + str(jobAvg) + " ," +
                    str(etlJobRunAvgStatus))
        Logger.info("------------------------------")
Example No. 16
    def hiveMetadata(self, date):
        hm = HiveMetadata()

        # Run the stored procedure
        hm.storedProcedure(date)

        # Total number of Hive tables
        hive_table_cn = hm.getHiveTableCount(date)
        hiveTableCnStatus = self.modifyIndicatorSystem(date, 'hive_table_cn',
                                                       hive_table_cn)

        # Print the log
        Logger.info("------------------------------")
        Logger.info("hive_table_cn: " + str(hive_table_cn) + " ," +
                    str(hiveTableCnStatus))
        Logger.info("------------------------------")
Example No. 17
    def minireportJob(self, date):

        mr = MinireportJob()

        # Run the stored procedure
        mr.storedProcedure(date)

        # Total number of minireports
        minireportCount = mr.getMinireportCount(date)
        minireportCnStatus = self.modifyIndicatorSystem(
            date, 'minireport_cn', minireportCount)

        # Overall average minireport run time
        minireportAvg = mr.getMinireportTotalAvgTime(date)
        minireportRunAvgStatus = self.modifyIndicatorSystem(
            date, 'minireport_run_avg', minireportAvg)

        # Print the log
        Logger.info("------------------------------")
        Logger.info("minireport_cn: " + str(minireportCount) + " ," +
                    str(minireportCnStatus))
        Logger.info("minireport_run_avg: " + str(minireportAvg) + " ," +
                    str(minireportRunAvgStatus))
        Logger.info("------------------------------")
Example No. 18
    def runSqlByHive(self, sqlContent, runEnv):
        Logger.info(sqlContent)
        Logger.info("Running.....")

        status = False

        # Runtime environment
        if (runEnv == "local"):
            # Run with the local Hive CLI
            status = self.getRegisterInstance('hiveModel').runHiveScript(sqlContent)
        elif (runEnv == "hiveserver2"):
            # Submit over Hive JDBC (HiveServer2)
            status = self.getRegisterInstance('hiveModel').batchExecuteSql(sqlContent)
        else:
            status = self.getRegisterInstance('hiveModel').batchExecuteSql(sqlContent)

        # Print the log
        Logger.info(self.runLog("Run result", status))
 
        return status
Example No. 19
 def shutdown(self):
     Logger.info("Result dw_app_access_log : " + str(self.dwAppAccessLogStatus))
     Logger.info("Result dw_app_action_detail_log : " + str(self.dwAppActionDetailLogStatus))
     Logger.info("Result dw_web_visit_traffic_log : " + str(self.dwWebVisitTrafficLogStatus))
     Logger.info("Result dw_web_action_detail_log : " + str(self.dwWebActionDetailLogStatus))
Example No. 20
    def extractMysqlTable(self, tableInfo):
        try:
            extractConf = self.getRegisterInstance(
                'confModel').getExtractConf()

            # Default table-name separator
            confSeparator = extractConf['core']['separator']
            # Target Hive database for extraction
            confTargetDb = extractConf['extract_mysql']['hive_target_db']
            # Local temp directory for dump files
            confDumpFileDir = extractConf['extract_mysql']['dump_file_dir']

            # Table info from the configuration table

            # tb id
            tbId = tableInfo['id']
            # Source MySQL server
            dbServer = tableInfo['db_server']
            # Source database name
            dbName = tableInfo['db_name']
            # Source table name
            tbName = tableInfo['tb_name']
            # Target Hive database name
            dbTargetDbName = tableInfo['target_db_name']
            # Target Hive table name
            dbTargetTbName = tableInfo['target_tb_name']
            # Extraction tool
            extractTool = tableInfo['extract_tool']
            # Extraction type
            extractType = tableInfo['extract_type']

            # Hive table naming rules
            # If a target database and table are specified, use them
            if (dbTargetDbName != "" and dbTargetTbName != ""):
                affirmTargetDb = dbTargetDbName
                affirmTargetTb = dbTargetTbName
            # Otherwise fall back to the default rules
            else:
                affirmTargetDb = confTargetDb
                affirmTargetTb = dbName + confSeparator + tbName

            # Instantiate the extraction object
            extractMysql = ExtractMysql()
            # Directory for dump-mode output
            extractMysql.setDumpFileDir(confDumpFileDir)

            # Set the source database server
            if (dbServer == ExtractMysql.PRODUCE_DB):
                extractMysql.setExtractDb(ExtractMysql.PRODUCE_DB)
            elif (dbServer == ExtractMysql.DW_DB):
                extractMysql.setExtractDb(ExtractMysql.DW_DB)
            else:
                Logger.info("Source database server does not exist! " + dbServer)

            # Set the extraction type
            # Full extraction
            if (extractType == ExtractMysql.COMPLETE):
                extractMysql.setExtractType(ExtractMysql.COMPLETE)
            # Incremental extraction
            elif (extractType == ExtractMysql.INCREMENTAL):
                extractMysql.setExtractType(ExtractMysql.INCREMENTAL)
            else:
                Logger.info("Extraction type does not exist! " + extractType)

            # Configure the extraction tool
            if (extractTool == ExtractMysql.MYSQL_DUMP):
                extractMysql.setExtractTool(ExtractMysql.MYSQL_DUMP)
            elif (extractTool == ExtractMysql.SQOOP):
                extractMysql.setExtractTool(ExtractMysql.SQOOP)
                extractMysql.setMapReduceNum(5)

            # Set the table info for extraction
            sourceDb = dbName
            sourceTable = tbName
            targetDb = affirmTargetDb
            targetTable = affirmTargetTb

            extractMysql.setTbId(tbId)
            extractMysql.setSourceDb(sourceDb)
            extractMysql.setSourceTable(sourceTable)
            extractMysql.setTargetDb(targetDb)
            extractMysql.setTargetTable(targetTable)
            extractMysql.run()

        except Exception as ex:
            log = "Exception -> table: " + str(dbServer) + ": " + str(
                dbName) + "." + str(tbName)
            log += " -> " + str(Exception) + ":" + str(ex)
            Logger.info(log)
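
For reference, a hypothetical tableInfo row illustrating the keys this method reads; all values are made up, and the constants are the ExtractMysql class attributes referenced above:

tableInfo = {
    'id': 42,                                 # tb id
    'db_server': ExtractMysql.PRODUCE_DB,     # or ExtractMysql.DW_DB
    'db_name': 'demo',                        # source MySQL database
    'tb_name': 'users',                       # source MySQL table
    'target_db_name': '',                     # empty -> default naming rules
    'target_tb_name': '',
    'extract_tool': ExtractMysql.MYSQL_DUMP,  # or ExtractMysql.SQOOP
    'extract_type': ExtractMysql.COMPLETE,    # or ExtractMysql.INCREMENTAL
}
# With empty target fields, the Hive table becomes
# confTargetDb + '.' + dbName + confSeparator + tbName.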
Example No. 21
    def extractIncrementalTable(self):
        Logger.info("---------- Incremental extraction start " + str(self.getTbId()) +
                    ': ' + str(self.getSourceDb()) + "." +
                    str(self.getSourceTable()) + " ----------")

        # Start time
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

        # Set the incremental table attributes
        Logger.info("Setting the incremental table attributes")
        tableInfoExt = self.getRegisterInstance(
            'biDbModel').getExtractMysqlTableExt(self.getTbId())
        self.setIncrementalAttribute(tableInfoExt)

        # Get the incremental table attributes
        Logger.info("Getting the incremental table attributes")
        incTbAttr = self.getIncrementalAttribute()
        primaryKey = incTbAttr['primary_key']
        incrementalField = incTbAttr['incremental_field']
        incrementalVal = incTbAttr['incremental_val']
        conditions = incTbAttr['conditions']

        # Hive target table
        targetTb = self.getTargetDb() + "." + self.getTargetTable()
        # Hive incremental table
        incTb = targetTb + "__inc"

        # 1. Drop the incremental extraction table
        Logger.info("Dropping the incremental extraction table")
        self.getRegisterInstance('hiveModel').dropTable(incTb)

        # 2. Create the incremental extraction table
        Logger.info("Creating the incremental extraction table")
        createHiveTableSql = "CREATE TABLE " + incTb + " LIKE " + targetTb
        createHiveTableResult = self.getRegisterInstance(
            'hiveModel').createTable(createHiveTableSql)

        incDumpSql = ""
        # 3. Dump the latest incremental data to a local file
        if (incrementalVal == ""):
            # Get the target table's max incremental-field value and use it
            # as the base point for extraction
            Logger.info("Getting the target table's max incremental value as the extraction base point")
            targetTbMaxPointVal = self.getHiveTbMaxVal(targetTb,
                                                       incrementalField)

            incDumpSql = "SELECT * FROM " + self.getSourceDb(
            ) + "." + self.getSourceTable(
            ) + " WHERE " + incrementalField + conditions + "'" + str(
                targetTbMaxPointVal) + "'"

            # Update the checkpoint
            Logger.info("Updating the checkpoint")
            self.updateTableExt(tableInfoExt['id'], targetTbMaxPointVal)
        else:
            incDumpSql = "SELECT * FROM " + self.getSourceDb(
            ) + "." + self.getSourceTable(
            ) + " WHERE " + incrementalField + conditions + "'" + incrementalVal + "'"

        Logger.info("Dumping the incremental data to a local file")
        dumpIncFile = self.getDumpFileDir() + "/" + self.getDumpFileName(
        ) + ".inc"
        dumpIncResult = self.extractDbServerModel.mysqlDumpFile(
            incDumpSql, dumpIncFile)

        # 4. Load the dump file into incHiveTable
        Logger.info("Loading the dump file into incHiveTable")
        hiveLoadSql = "LOAD DATA LOCAL INPATH '" + dumpIncFile + "' OVERWRITE INTO TABLE " + incTb + ";"
        hiveLoadResult = self.getRegisterInstance('hiveModel').runHiveScript(
            hiveLoadSql)

        # Get the max incremental-field value in this incremental batch
        Logger.info("Getting the max incremental-field value in this batch")
        incTbMaxPointVal = self.getHiveTbMaxVal(incTb, incrementalField)
        # If the incremental batch is empty, do nothing and return
        if (incTbMaxPointVal == None):
            Logger.info("Incremental data is empty...")
            return

        # 5. Run the merge to apply the increment
        incHiveSql = """
INSERT OVERWRITE TABLE %(targetTb)s
SELECT *
FROM (
    SELECT a.*
    FROM %(targetTb)s AS a
    LEFT JOIN %(incTb)s AS b
        ON a.%(primaryKey)s = b.%(primaryKey)s
    WHERE b.%(primaryKey)s IS NULL
) AS bs

UNION ALL
SELECT * FROM %(incTb)s
;""" % {
            'targetTb': targetTb,
            'incTb': incTb,
            'primaryKey': primaryKey
        }

        Logger.info(incHiveSql)

        # 6. Run the final merge with Spark SQL
        Logger.info("Running the final merge with Spark SQL")
        incSqlResult = self.getRegisterInstance('sparkModel').batchExecuteSql(
            incHiveSql)
        # NOTE: the Spark result is overridden here, forcing success
        incSqlResult = True

        # 7. Check the results
        if (dumpIncResult['code'] == 0 and hiveLoadResult['code'] == 0
                and createHiveTableResult == True and incSqlResult == True):

            # Update the checkpoint
            if (incTbMaxPointVal != None):

                self.updateTableExt(tableInfoExt['id'], incTbMaxPointVal)

            resultCode = 0
        else:
            resultCode = 1

        # Compute elapsed time, write the log, and print
        # Elapsed time
        diffTimestamp = self.getRegisterInstance(
            'dateModel').getTimestamp() - startTimestamp

        # Write the log record to MySQL
        self.extractLog(resultCode, diffTimestamp)

        # Print the log
        logStr = "Incremental extraction : (Dump : " + str(self.getTbId()) + ': ' + str(
            self.getSourceDb()) + "." + str(self.getSourceTable(
            )) + " -> " + str(self.getTargetDb()) + "." + str(
                self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")"
        Logger.info(logStr)
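
The step-5 merge SQL amounts to an upsert: keep target rows whose primary key is absent from the increment, then append every increment row. A plain-Python sketch of the semantics with made-up rows:

target = {1: 'alice v1', 2: 'bob v1'}   # keyed by primary key
inc = {2: 'bob v2', 3: 'carol v1'}      # incremental batch

merged = {k: v for k, v in target.items() if k not in inc}
merged.update(inc)
print merged  # -> {1: 'alice v1', 2: 'bob v2', 3: 'carol v1'}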
Example No. 22
    def dataWarehouse(self, date):
        # Get yesterday's date, since each run aggregates the previous day's data
        offsetDay = self.getRegisterInstance('dateModel').getOffsetDateDay(
            date, -1)

        # dw_web_visit_traffic_log table count
        dwWebVisitTrafficLogCn = self.getTableCountForDate(
            'dw_db.dw_web_visit_traffic_log', offsetDay)
        dwWebVisitTrafficLogCnStatus = self.modifyIndicatorSystem(
            date, 'dw_web_visit_traffic_log', dwWebVisitTrafficLogCn)

        # dw_web_action_detail_log table count
        dwWebActionDetailLogCn = self.getTableCountForDate(
            'dw_db.dw_web_action_detail_log', offsetDay)
        dwWebActionDetailLogCnStatus = self.modifyIndicatorSystem(
            date, 'dw_web_action_detail_log', dwWebActionDetailLogCn)

        # dw_app_access_log table count
        dwAppAccessLogCn = self.getTableCountForDate('dw_db.dw_app_access_log',
                                                     offsetDay)
        dwAppAccessLogCnStatus = self.modifyIndicatorSystem(
            date, 'dw_app_access_log', dwAppAccessLogCn)

        # dw_app_action_detail_log table count
        dwAppActionDetailLogCn = self.getTableCountForDate(
            'dw_db.dw_app_action_detail_log', offsetDay)
        dwAppActionDetailLogCnStatus = self.modifyIndicatorSystem(
            date, 'dw_app_action_detail_log', dwAppActionDetailLogCn)

        # dw_property_inventory_sd table count
        dwPropertyInventorySdCn = self.getTableCountForDate(
            'dw_db.dw_property_inventory_sd', offsetDay)
        dwPropertyInventorySdCnStatus = self.modifyIndicatorSystem(
            date, 'dw_property_inventory_sd', dwPropertyInventorySdCn)

        # Print the log
        Logger.info("------------------------------")
        Logger.info(str(date) + " " + str(offsetDay))
        Logger.info("dw_web_visit_traffic_log: " +
                    str(dwWebVisitTrafficLogCn) + " ," +
                    str(dwWebVisitTrafficLogCnStatus))
        Logger.info("dw_web_action_detail_log: " +
                    str(dwWebActionDetailLogCn) + " ," +
                    str(dwWebActionDetailLogCnStatus))
        Logger.info("dw_app_access_log: " + str(dwAppAccessLogCn) + " ," +
                    str(dwAppAccessLogCnStatus))
        Logger.info("dw_app_action_detail_log: " +
                    str(dwAppActionDetailLogCn) + " ," +
                    str(dwAppActionDetailLogCnStatus))
        Logger.info("dw_property_inventory_sd: " +
                    str(dwPropertyInventorySdCn) + " ," +
                    str(dwPropertyInventorySdCnStatus))
        Logger.info("------------------------------")
Example No. 23
    def extractMysqlDump(self):
        Logger.info("---------- mysqlDump start " + str(self.getTbId()) + ': ' +
                    str(self.getSourceDb()) + "." +
                    str(self.getSourceTable()) + " ---------- ")
        # Start time
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

        # 1. Drop the corresponding target Hive table
        Logger.info("Dropping the corresponding target Hive table")
        self.getRegisterInstance('hiveModel').dropTable(self.getTargetDb() +
                                                        "." +
                                                        self.getTargetTable())

        # 2. Dump the MySQL data to a file
        Logger.info("Dumping the MySQL data to a file")
        dumpSql = "SELECT * FROM " + self.getSourceDb(
        ) + "." + self.getSourceTable()
        dumpFile = self.getDumpFileDir() + "/" + self.getDumpFileName()
        dumpResult = self.extractDbServerModel.mysqlDumpFile(dumpSql, dumpFile)

        # 3. Create the Hive schema from the MySQL table schema
        Logger.info("Creating the Hive schema from the MySQL table schema")
        sourceTableFields = self.getSourceTableFields()
        formatTableFieldsList = []
        for curField in sourceTableFields:
            formatTableFieldsList.append('`' + curField + '`')
        formatTableFieldsStr = ' String,'.join(
            formatTableFieldsList) + " String"

        createHiveTableSql = '''
CREATE TABLE IF NOT EXISTS  %s.%s (
%s
) ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\\001'
COLLECTION ITEMS TERMINATED BY '\\n'
STORED AS TEXTFILE
''' % (self.getTargetDb(), self.getTargetTable(), formatTableFieldsStr)

        # Run the Hive CREATE TABLE
        Logger.info("Running the Hive CREATE TABLE")
        createHiveTableResult = self.getRegisterInstance(
            'hiveModel').createTable(createHiveTableSql)

        # 4. Load the dump file into the Hive table
        Logger.info("Loading the dump file into the Hive table")
        hiveLoadSql = "LOAD DATA LOCAL INPATH '" + self.getDumpFileDir(
        ) + "/" + self.getDumpFileName(
        ) + "' OVERWRITE INTO TABLE " + self.getTargetDb(
        ) + "." + self.getTargetTable() + ";"
        hiveLoadResult = self.getRegisterInstance('hiveModel').runHiveScript(
            hiveLoadSql)

        # 5. Check the results
        if (dumpResult['code'] == 0 and createHiveTableResult == True
                and hiveLoadResult['code'] == 0):
            resultCode = 0
        else:
            resultCode = hiveLoadResult['code']

        # 6. Compute elapsed time, write the log, and print
        # Elapsed time
        diffTimestamp = self.getRegisterInstance(
            'dateModel').getTimestamp() - startTimestamp

        # Write the log record to MySQL
        self.extractLog(resultCode, diffTimestamp)

        # Print the log
        logStr = "Full extraction : (Dump : " + str(self.getTbId()) + ': ' + str(
            self.getSourceDb()) + "." + str(self.getSourceTable(
            )) + " -> " + str(self.getTargetDb()) + "." + str(
                self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")"
        Logger.info(logStr)