def cache_running_info(self, func_id, script_publish_version, exec_mode=None, is_failed=False, cost=None):
    timestamp = int(time.time())

    # Global counters
    data = {
        'funcId'              : func_id,
        'scriptPublishVersion': script_publish_version,
        'execMode'            : exec_mode,
        'isFailed'            : is_failed,
        'cost'                : cost,
        'timestamp'           : timestamp,
    }
    data = toolkit.json_dumps(data, indent=0)

    cache_key = toolkit.get_cache_key('syncCache', 'scriptRunningInfo')
    self.cache_db.lpush(cache_key, data)

    # Function call counters
    data = {
        'funcId'   : func_id,
        'timestamp': timestamp,
    }
    data = toolkit.json_dumps(data, indent=0)

    cache_key = toolkit.get_cache_key('syncCache', 'funcCallInfo')
    self.cache_db.lpush(cache_key, data)

def clear_deprecated_data(self):
    self.clear_table('biz_main_script_log')
    self.clear_table('biz_main_script_failure')
    self.clear_table('biz_main_batch_task_info')
    self.clear_table('biz_main_crontab_task_info')

    self.clear_cache_key(toolkit.get_cache_key('syncCache', 'scriptFailure'))
    self.clear_cache_key(toolkit.get_cache_key('syncCache', 'scriptLog'))
    self.clear_cache_key(toolkit.get_cache_key('syncCache', 'taskInfo'))

def _cache_scripts(self):
    scripts = sorted(SCRIPT_MAP.values(), key=lambda x: x['seq'])
    scripts_dump = toolkit.json_dumps(scripts, sort_keys=True)

    cache_key = toolkit.get_cache_key('fixedCache', 'scriptsMD5')
    self.cache_db.set(cache_key, toolkit.get_md5(scripts_dump))

    cache_key = toolkit.get_cache_key('fixedCache', 'scriptsDump')
    self.cache_db.set(cache_key, scripts_dump)

def clear_outdated_task_info(self):
    origin_ids = set()

    # The integration crontab config never expires
    origin_ids.add(CONFIG['_INTEGRATION_CRONTAB_CONFIG_ID'])

    # Crontab config IDs
    sql = '''
        SELECT id FROM biz_main_crontab_config
        '''
    db_res = self.db.query(sql)
    for d in db_res:
        origin_ids.add(d['id'])

    # Batch IDs
    sql = '''
        SELECT id FROM biz_main_batch
        '''
    db_res = self.db.query(sql)
    for d in db_res:
        origin_ids.add(d['id'])

    # Collect all task info cache keys and drop those whose origin no longer exists
    cache_pattern = toolkit.get_cache_key('syncCache', 'taskInfo', tags=['originId', '*'])
    cache_res = self.cache_db.keys(cache_pattern)
    for cache_key in cache_res:
        cache_key_info = toolkit.parse_cache_key(cache_key)
        if cache_key_info['tags']['originId'] not in origin_ids:
            self.cache_db.delete(cache_key)

def cache_task_status(self, origin, origin_id, exec_mode, status, func_id=None, script_publish_version=None, log_messages=None, einfo_text=None):
    if not all([origin, origin_id]):
        return

    if origin not in ('crontab', 'batch') and exec_mode != 'crontab':
        return

    cache_key = toolkit.get_cache_key('syncCache', 'taskInfo')

    data = {
        'taskId'              : self.request.id,
        'origin'              : origin,
        'originId'            : origin_id,
        'funcId'              : func_id,
        'scriptPublishVersion': script_publish_version,
        'execMode'            : exec_mode,
        'status'              : status,
        'logMessages'         : log_messages,
        'einfoTEXT'           : einfo_text,
        'timestamp'           : int(time.time()),
    }
    data = toolkit.json_safe_dumps(data, indent=0)

    self.cache_db.run('lpush', cache_key, data)

def cache_script_failure(self, func_id, script_publish_version, exec_mode=None, einfo_text=None, trace_info=None):
    if not CONFIG['_INTERNAL_KEEP_SCRIPT_FAILURE']:
        return

    if not einfo_text:
        return

    cache_key = toolkit.get_cache_key('syncCache', 'scriptFailure')

    data = {
        'funcId'              : func_id,
        'scriptPublishVersion': script_publish_version,
        'execMode'            : exec_mode,
        'einfoTEXT'           : einfo_text,
        'traceInfo'           : trace_info,
        'timestamp'           : int(time.time()),
    }
    data = toolkit.json_safe_dumps(data, indent=0)

    self.cache_db.run('lpush', cache_key, data)

def dataflux_func_auto_run(self, *args, **kwargs):
    lock_key   = toolkit.get_cache_key('lock', 'autoRun')
    lock_value = toolkit.gen_uuid()
    if not self.cache_db.lock(lock_key, lock_value, 30):
        self.logger.warning('DataFluxFunc AutoRun Task already launched.')
        return

    self.logger.info('DataFluxFunc AutoRun Task launched.')

    # Collect integration functions marked as auto-run
    integrated_auto_run_funcs = self.get_integrated_auto_run_funcs()
    for f in integrated_auto_run_funcs:
        # Task ID
        task_id = gen_task_id()

        # Task kwargs
        task_kwargs = {
            'funcId'  : f['id'],
            'origin'  : 'integration',
            'execMode': 'async',
            'queue'   : CONFIG['_FUNC_TASK_DEFAULT_QUEUE'],
        }

        # Auto-run always uses the default queue
        queue = toolkit.get_worker_queue(CONFIG['_FUNC_TASK_DEFAULT_QUEUE'])

        dataflux_func_runner.apply_async(task_id=task_id, kwargs=task_kwargs, queue=queue)

def dataflux_func_auto_cleaner(self, *args, **kwargs):
    lock_key   = toolkit.get_cache_key('lock', 'autoCleaner')
    lock_value = toolkit.gen_uuid()
    if not self.cache_db.lock(lock_key, lock_value, 30):
        self.logger.warning('DataFluxFunc AutoCleaner Task already launched.')
        return

    self.logger.info('DataFluxFunc AutoCleaner Task launched.')

    # Truncate tables whose data is not configured to be kept
    if not CONFIG['_INTERNAL_KEEP_SCRIPT_LOG']:
        self.clear_table('biz_main_script_log')

    if not CONFIG['_INTERNAL_KEEP_SCRIPT_FAILURE']:
        self.clear_table('biz_main_script_failure')

    # Roll over table data by row limit
    table_limit_map = CONFIG['_DBDATA_TABLE_LIMIT_MAP']
    for table, limit in table_limit_map.items():
        try:
            self.clear_table_by_limit(table=table, limit=int(limit))
        except Exception as e:
            for line in traceback.format_exc().splitlines():
                self.logger.error(line)

    # Roll over the upload file folder by expiry
    upload_file_expires = CONFIG['_UPLOAD_FILE_EXPIRES']
    self.clear_upload_file_by_expires(expires=upload_file_expires)

def dataflux_func_worker_queue_pressure_recover(self, *args, **kwargs):
    self.logger.info('DataFluxFunc Worker Queue Pressure Recover Task launched.')

    for i in range(CONFIG['_WORKER_QUEUE_COUNT']):
        queue_key = toolkit.get_worker_queue(i)
        queue_length = self.cache_db.run('llen', queue_key)

        if not queue_length or int(queue_length) <= 0:
            cache_key = toolkit.get_cache_key('cache', 'workerQueuePressure', tags=['workerQueue', i])
            self.cache_db.run('set', cache_key, 0)

def lock(self, max_age=60):
    lock_key   = toolkit.get_cache_key('lock', self.name)
    lock_value = toolkit.gen_uuid()
    if not self.cache_db.lock(lock_key, lock_value, max_age):
        self.logger.warning(f"`{self.name}` Task already launched.")
        return

    self.launch_log()

    return lock_key, lock_value

def dataflux_func_reload_scripts(self, *args, **kwargs):
    is_startup = kwargs.get('isOnLaunch') or False
    force      = kwargs.get('force')      or False

    # When executed at startup, a lock is required
    if is_startup:
        lock_key   = toolkit.get_cache_key('lock', 'reloadScripts')
        lock_value = toolkit.gen_uuid()
        if not self.cache_db.lock(lock_key, lock_value, 10):
            self.logger.warning('DataFluxFunc ReloadScriptDict Task already launched.')
            return

    self.logger.info('DataFluxFunc ReloadScriptDict Task launched.')

    cache_key = toolkit.get_cache_key('fixedCache', 'prevDBUpdateTimestamp')

    # Timestamp of the previous script publish
    prev_publish_timestamp = float(self.cache_db.get(cache_key) or 0.0)
    if not prev_publish_timestamp:
        force = True

    # Timestamp of the latest script publish
    latest_publish_timestamp = self.get_latest_publish_timestamp()

    is_script_reloaded = False
    if force:
        self.force_reload_script()
        is_script_reloaded = True

    elif latest_publish_timestamp != prev_publish_timestamp:
        self.reload_script()
        is_script_reloaded = True

    if is_script_reloaded:
        self.logger.debug('[SCRIPT CACHE] Reload script {} -> {} {}'.format(
            arrow.get(prev_publish_timestamp).to('Asia/Shanghai').format('YYYY-MM-DD HH:mm:ss'),
            arrow.get(latest_publish_timestamp).to('Asia/Shanghai').format('YYYY-MM-DD HH:mm:ss'),
            '[FORCE]' if force else ''))

        self.cache_db.set(cache_key, str(latest_publish_timestamp))

def cache_func_pressure(self, func_id, func_call_kwargs_md5, func_pressure, func_cost, queue):
    if not all([func_id, func_call_kwargs_md5, func_pressure, func_cost, queue]):
        return

    # Compute the max pressure of the worker queue
    worker_queue_max_pressure = CONFIG['_WORKER_LIMIT_WORKER_QUEUE_PRESSURE_BASE']

    cache_key = toolkit.get_cache_key('heartbeat', 'workerOnQueueCount', tags=['workerQueue', queue])
    worker_count = self.cache_db.get(cache_key)
    if not worker_count:
        worker_count = 1
    else:
        worker_count = int(worker_count) or 1

    worker_queue_max_pressure = worker_count * CONFIG['_WORKER_LIMIT_WORKER_QUEUE_PRESSURE_BASE']

    # Compute and record the new function pressure
    cache_key = toolkit.get_cache_key('cache', 'funcPressure', tags=[
        'funcId'           , func_id,
        'funcCallKwargsMD5', func_call_kwargs_md5])

    prev_func_pressure = self.cache_db.get(cache_key)
    if prev_func_pressure:
        prev_func_pressure = int(prev_func_pressure)
    else:
        prev_func_pressure = CONFIG['_WORKER_LIMIT_FUNC_PRESSURE_BASE']

    next_func_pressure = int((prev_func_pressure + func_cost) / 2)

    self.cache_db.setex(cache_key, CONFIG['_WORKER_LIMIT_FUNC_PRESSURE_EXPIRES'], next_func_pressure)

    # Task finished: reduce the queue pressure
    cache_key = toolkit.get_cache_key('cache', 'workerQueuePressure', tags=['workerQueue', queue])
    current_worker_queue_pressure = self.cache_db.run('decrby', cache_key, func_pressure)

    self.cache_db.run('expire', cache_key, CONFIG['_WORKER_LIMIT_WORKER_QUEUE_PRESSURE_EXPIRES'])

    self.logger.debug('<<< FUNC PRESSURE >>> {}: {}, Cost: {}'.format(func_id, func_pressure, func_cost))
    self.logger.debug('<<< QUEUE PRESSURE >>> WorkerQueue#{}: {} (-{}, {}%)'.format(
        queue, current_worker_queue_pressure, abs(func_pressure),
        int(current_worker_queue_pressure / worker_queue_max_pressure * 100)))

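# Minimal, self-contained sketch (not part of the original module) of the pressure
# smoothing rule used above: each observation moves the cached pressure halfway toward
# the latest cost, so the value converges after a few calls. The starting value and
# the observed costs below are made-up numbers for illustration only.
def _demo_func_pressure_smoothing(prev_pressure=100, observed_costs=(40, 40, 40)):
    for cost in observed_costs:
        prev_pressure = int((prev_pressure + cost) / 2)
    return prev_pressure  # 100 -> 70 -> 55 -> 47
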
def cache_func_result(self, func_id, script_code_md5, script_publish_version, func_call_kwargs_md5, result, cache_result_expires):
    if not all([func_id, script_code_md5, script_publish_version, func_call_kwargs_md5, cache_result_expires]):
        return

    cache_key = toolkit.get_cache_key('cache', 'funcResult', tags=[
        'funcId'              , func_id,
        'scriptCodeMD5'       , script_code_md5,
        'scriptPublishVersion', script_publish_version,
        'funcCallKwargsMD5'   , func_call_kwargs_md5])

    result_dumps = toolkit.json_safe_dumps(result)
    self.cache_db.setex(cache_key, cache_result_expires, result_dumps)

def cache_script_running_info(self, func_id, script_publish_version, exec_mode=None, is_failed=False, cost=None):
    cache_key = toolkit.get_cache_key('syncCache', 'scriptRunningInfo')

    data = {
        'funcId'              : func_id,
        'scriptPublishVersion': script_publish_version,
        'execMode'            : exec_mode,
        'isFailed'            : is_failed,
        'cost'                : cost,
        'timestamp'           : int(time.time()),
    }
    data = toolkit.json_safe_dumps(data, indent=0)

    self.cache_db.run('lpush', cache_key, data)

def dataflux_func_auto_cleaner(self, *args, **kwargs):
    lock_key   = toolkit.get_cache_key('lock', 'autoCleaner')
    lock_value = toolkit.gen_uuid()
    if not self.cache_db.lock(lock_key, lock_value, 30):
        self.logger.warning('DataFluxFunc AutoCleaner Task already launched.')
        return

    self.logger.info('DataFluxFunc AutoCleaner Task launched.')

    # Clear data by table row limit
    table_limit_map = CONFIG['_DBDATA_TABLE_LIMIT_MAP']
    for table, limit in table_limit_map.items():
        try:
            self.clear_table_by_limit(table=table, limit=int(limit))
        except Exception as e:
            for line in traceback.format_exc().splitlines():
                self.logger.error(line)

def cache_task_status(self, crontab_id, task_id, func_id):
    if not crontab_id:
        return

    cache_key = toolkit.get_cache_key('syncCache', 'taskInfo')

    data = {
        'taskId'   : task_id,
        'origin'   : 'crontab',
        'originId' : crontab_id,
        'funcId'   : func_id,
        'status'   : 'queued',
        'timestamp': int(time.time()),
    }
    data = toolkit.json_safe_dumps(data, indent=0)

    self.cache_db.run('lpush', cache_key, data)

def trim_task_info(self, origin, origin_id, exec_mode, task_info_limit=None):
    if not self.is_support_task_info(origin, origin_id, exec_mode):
        return

    task_info_limit = task_info_limit or CONFIG['_TASK_INFO_DEFAULT_LIMIT']
    task_info_limit = task_info_limit - 1
    if task_info_limit < 0:
        task_info_limit = 0

    _start = 0
    _stop  = task_info_limit - 1

    cache_key = toolkit.get_cache_key('syncCache', 'taskInfo', tags=['originId', origin_id])
    self.cache_db.run('ltrim', cache_key, _start, _stop)

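# Minimal sketch (illustrative only, a plain Python list standing in for Redis) of the
# LPUSH + LTRIM rolling-window pattern used above: LTRIM key 0 stop keeps the stop + 1
# most recently pushed entries and discards the rest.
def _demo_rolling_window(entries, stop):
    window = []
    for entry in entries:
        window.insert(0, entry)     # LPUSH: newest entry goes to the head
        window = window[:stop + 1]  # LTRIM 0 stop: keep indexes 0..stop
    return window

# _demo_rolling_window(range(5), stop=2) -> [4, 3, 2]
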
def cache_script_log(self, func_id, script_publish_version, log_messages, exec_mode=None):
    if not CONFIG['_INTERNAL_KEEP_SCRIPT_LOG']:
        return

    if not log_messages:
        return

    cache_key = toolkit.get_cache_key('syncCache', 'scriptLog')

    data = {
        'funcId'              : func_id,
        'scriptPublishVersion': script_publish_version,
        'execMode'            : exec_mode,
        'logMessages'         : log_messages,
        'timestamp'           : int(time.time()),
    }
    data = toolkit.json_safe_dumps(data, indent=0)

    self.cache_db.run('lpush', cache_key, data)

def sync_func_call_count(self):
    data = []

    # Collect data from the sync cache
    cache_key = toolkit.get_cache_key('syncCache', 'funcCallInfo')
    for i in range(CONFIG['_SYNC_CACHE_BATCH_COUNT']):
        cache_res = self.cache_db.run('rpop', cache_key)
        if not cache_res:
            break

        try:
            cache_res = toolkit.json_loads(cache_res)
        except Exception as e:
            for line in traceback.format_exc().splitlines():
                self.logger.error(line)
        else:
            data.append(cache_res)

    # Aggregate the counts
    count_map = {}
    for d in data:
        func_id   = d['funcId']
        timestamp = d.get('timestamp')

        # Align timestamps to the minute (reduces pressure on internal time-series storage)
        timestamp = int(int(timestamp) / 60) * 60

        pk = '~'.join([func_id, str(timestamp)])
        if pk not in count_map:
            count_map[pk] = {
                'funcId'   : func_id,
                'timestamp': timestamp,
                'count'    : 0,
            }

        count_map[pk]['count'] += 1

    # Write the time-series data
    for pk, c in count_map.items():
        cache_key = toolkit.get_server_cache_key('monitor', 'sysStats', ['metric', 'funcCallCount', 'funcId', c['funcId']])
        self.cache_db.ts_add(cache_key, c['count'], timestamp=c['timestamp'], mode='addUp')

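# Tiny sketch (illustrative only) of the minute alignment used above: flooring a Unix
# timestamp to its minute boundary lets all calls within the same minute share one
# time-series point.
def _demo_align_to_minute(timestamp):
    return int(int(timestamp) / 60) * 60

# _demo_align_to_minute(1600000037) -> 1600000020
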
def reload_scripts(self, *args, **kwargs):
    is_startup = kwargs.get('isOnLaunch')  or False
    is_crontab = kwargs.get('isOnCrontab') or False
    force      = kwargs.get('force')       or False

    # When executed at startup or by crontab, a lock is required
    if is_startup or is_crontab:
        self.lock(max_age=10)
    else:
        self.launch_log()

    cache_key = toolkit.get_cache_key('fixedCache', 'prevScriptDataHash')

    # Hash of the previous script data
    prev_script_data_hash = self.cache_db.get(cache_key)
    if not prev_script_data_hash:
        force = True
    else:
        prev_script_data_hash = six.ensure_str(prev_script_data_hash)

    # Hash of the latest script data
    latest_script_data_hash = self.get_latest_script_data_hash()

    is_script_reloaded = False
    if force:
        self.force_reload_script()
        is_script_reloaded = True

    elif latest_script_data_hash != prev_script_data_hash:
        self.reload_script()
        is_script_reloaded = True

    if is_script_reloaded:
        self.logger.info('[SCRIPT CACHE] Reload script {} -> {} {}'.format(
            prev_script_data_hash,
            latest_script_data_hash,
            '[FORCE]' if force else ''))

        self.cache_db.set(cache_key, latest_script_data_hash)

def dataflux_func_sync_cache(self, *args, **kwargs):
    lock_key   = toolkit.get_cache_key('lock', 'syncCache')
    lock_value = toolkit.gen_uuid()
    if not self.cache_db.lock(lock_key, lock_value, 30):
        self.logger.warning('DataFluxFunc SyncCache Task already launched.')
        return

    self.logger.info('DataFluxFunc SyncCache Task launched.')

    # Flush script running info into the database
    try:
        self.sync_script_running_info()
    except Exception as e:
        for line in traceback.format_exc().splitlines():
            self.logger.error(line)

    # Flush script failures into the database
    try:
        self.sync_script_failure()
    except Exception as e:
        for line in traceback.format_exc().splitlines():
            self.logger.error(line)

    # Flush script logs into the database
    try:
        self.sync_script_log()
    except Exception as e:
        for line in traceback.format_exc().splitlines():
            self.logger.error(line)

    # Flush task info into the database
    try:
        self.sync_task_info()
    except Exception as e:
        for line in traceback.format_exc().splitlines():
            self.logger.error(line)

def dataflux_func_starter_crontab(self, *args, **kwargs):
    self.logger.info('DataFluxFunc Crontab Starter Task launched.')

    # Note: wait 1 second so the task never runs exactly on the minute,
    # which would snap back to the previous trigger point
    time.sleep(1)

    # Compute the current trigger point
    now = arrow.get().to('Asia/Shanghai').datetime
    starter_crontab = crontab_parser.CronTab(CONFIG['_CRONTAB_STARTER'])
    trigger_time = int(starter_crontab.previous(delta=False, now=now))
    current_time = int(time.time())

    # Collect integration crontab configs
    integrated_crontab_configs = self.get_integrated_func_crontab_configs()

    # Fetch the crontab configs to execute, page by page
    next_seq = 0
    while next_seq is not None:
        crontab_configs, next_seq = self.fetch_crontab_configs(next_seq)

        # On the first page, prepend the integration crontab configs
        if integrated_crontab_configs:
            crontab_configs = integrated_crontab_configs + crontab_configs
            integrated_crontab_configs = None

        # Dispatch tasks
        for c in crontab_configs:
            # Skip configs whose trigger time has not been reached yet
            if not self.crontab_config_filter(trigger_time, c):
                continue

            # Determine the worker queue
            specified_queue = None
            try:
                specified_queue = c['funcExtraConfig']['queue']
            except Exception as e:
                pass

            queue = None
            if specified_queue is None:
                queue = toolkit.get_worker_queue(CONFIG['_FUNC_TASK_DEFAULT_CRONTAB_QUEUE'])

            else:
                if isinstance(specified_queue, int) and 0 <= specified_queue < CONFIG['_WORKER_QUEUE_COUNT']:
                    # A queue number was specified directly
                    queue = toolkit.get_worker_queue(specified_queue)

                else:
                    # A queue alias was specified
                    try:
                        queue_number = int(CONFIG['WORKER_QUEUE_ALIAS_MAP'][specified_queue])
                    except Exception as e:
                        # Bad config: the alias cannot be resolved to a queue number, or the
                        # number is out of range. Fall back to the default function queue so
                        # that some Worker always picks the task up (it will fail at run time).
                        queue = toolkit.get_worker_queue(CONFIG['_FUNC_TASK_DEFAULT_CRONTAB_QUEUE'])
                    else:
                        # Convert the queue alias to a queue number
                        queue = toolkit.get_worker_queue(queue_number)

            # Determine the timeout
            soft_time_limit = CONFIG['_FUNC_TASK_DEFAULT_TIMEOUT']
            time_limit      = CONFIG['_FUNC_TASK_DEFAULT_TIMEOUT'] + CONFIG['_FUNC_TASK_EXTRA_TIMEOUT_TO_KILL']

            func_timeout = None
            try:
                func_timeout = c['funcExtraConfig']['timeout']
            except Exception as e:
                pass

            # A valid timeout is configured: use it instead of the default
            if isinstance(func_timeout, (six.integer_types, float)) and func_timeout > 0:
                soft_time_limit = func_timeout
                time_limit      = func_timeout + CONFIG['_FUNC_TASK_EXTRA_TIMEOUT_TO_KILL']

            # Compute the task expiry time
            _shift_seconds = int(soft_time_limit * CONFIG['_FUNC_TASK_TIMEOUT_TO_EXPIRE_SCALE'])
            expires = arrow.get().shift(seconds=_shift_seconds).datetime

            # Lock before dispatching the task; skip on failure
            lock_key   = toolkit.get_cache_key('lock', 'CrontabConfig', ['crontabConfigId', c['id']])
            lock_value = toolkit.gen_uuid()
            if not self.cache_db.lock(lock_key, lock_value, time_limit):
                continue

            # Task ID
            task_id = gen_task_id()

            # Record the task info (queued)
            self.cache_task_status(c['id'], task_id, func_id=c['funcId'])

            # Enqueue the task
            task_headers = {
                'origin': '{}-{}'.format(c['id'], current_time)  # Origin is marked as "<crontab config ID>-<timestamp>"
            }
            task_kwargs = {
                'funcId'        : c['funcId'],
                'funcCallKwargs': c['funcCallKwargs'],
                'origin'        : c.get('execMode') or 'crontab',
                'originId'      : c['id'],
                'saveResult'    : c['saveResult'],
                'execMode'      : 'crontab',
                'triggerTime'   : trigger_time,
                'crontab'       : c['crontab'],
                'queue'         : specified_queue,
                'lockKey'       : lock_key,
                'lockValue'     : lock_value,
            }
            dataflux_func_runner.apply_async(
                task_id=task_id,
                kwargs=task_kwargs,
                headers=task_headers,
                queue=queue,
                soft_time_limit=soft_time_limit,
                time_limit=time_limit,
                expires=expires)

def get_server_cache_key(topic, name, tags=None):
    return toolkit.get_cache_key(topic, name, tags, APP_NAME_SERVER)

def update_script_dict_cache(self):
    '''
    Update the script dict cache

    Works together with DataFluxFuncReloadScriptsTask to provide fast script loading:
    1. Read the cache timestamp from local memory; if it has not expired, stop here
    2. Check the current script cache MD5 in Redis
        2.1. If it has not changed, extend the local cache lifetime and stop
        2.2. If it has changed, fetch the script cache data from Redis
    3. If no script cache data exists in Redis, read directly from the database
       (this normally never happens, because DataFluxFuncReloadScriptsTask refreshes the Redis cache periodically)
    '''
    global SCRIPTS_CACHE_MD5
    global SCRIPTS_CACHE_TIMESTAMP
    global SCRIPT_DICT_CACHE

    current_timestamp = time.time()

    cache_key_script_md5  = toolkit.get_cache_key('fixedCache', 'scriptsMD5')
    cache_key_script_dump = toolkit.get_cache_key('fixedCache', 'scriptsDump')

    # 1. Try the local cache first, without checking for updates
    if current_timestamp - SCRIPTS_CACHE_TIMESTAMP < CONFIG['_FUNC_TASK_LOCAL_CACHE_EXPIRES']:
        # Still within the retention window, skip
        self.logger.debug('[SCRIPT CACHE] Use local cache')
        return

    # 2. Check the Redis cache
    scripts_md5 = self.cache_db.get(cache_key_script_md5)
    if scripts_md5:
        scripts_md5 = six.ensure_str(scripts_md5)

    scripts_dump_exists = self.cache_db.exists(cache_key_script_dump)

    if scripts_md5 and scripts_md5 == SCRIPTS_CACHE_MD5 and scripts_dump_exists:
        # Cache exists and the MD5 has not changed: extend the local cache
        SCRIPTS_CACHE_TIMESTAMP = current_timestamp

        self.logger.debug('[SCRIPT CACHE] Not Modified, extend local cache')
        return

    # 3. No local cache, or the cached MD5 changed: read the dump from Redis
    scripts = None
    scripts_dump = self.cache_db.get(cache_key_script_dump)
    if scripts_dump:
        self.logger.debug('[SCRIPT CACHE] Modified, Use Redis cache')

        scripts_dump = six.ensure_str(scripts_dump)
        try:
            scripts = ujson.loads(scripts_dump)
        except Exception as e:
            pass

        if not scripts_md5:
            # No cached MD5: compute it locally (rare)
            scripts_md5 = toolkit.get_md5(scripts_dump)

        # Record the cache MD5
        SCRIPTS_CACHE_MD5 = scripts_md5

    # 4. Failed to read the dump from Redis: load the full user scripts from the database
    if not scripts or not scripts_dump:
        self.logger.warning('[SCRIPT CACHE] Cache failed! Use DB data')

        scripts = self.get_scripts()

        # Compute and record the cache MD5 locally
        scripts_dump = toolkit.json_safe_dumps(scripts, sort_keys=True)
        SCRIPTS_CACHE_MD5 = toolkit.get_md5(scripts_dump)

    # Store into the local cache
    SCRIPTS_CACHE_TIMESTAMP = current_timestamp
    SCRIPT_DICT_CACHE = self.create_script_dict(scripts)

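# Condensed sketch (illustrative only, with plain dicts standing in for Redis and the
# module-level globals) of the decision order implemented above: local TTL first, then
# a cheap MD5 comparison, and only then the full script dump or the database.
def _demo_script_cache_decision(local, redis, now, local_expires=60):
    if now - local['timestamp'] < local_expires:
        return 'use local cache'
    if redis.get('scriptsMD5') == local['md5'] and 'scriptsDump' in redis:
        return 'not modified, extend local cache'
    if 'scriptsDump' in redis:
        return 'modified, reload from Redis dump'
    return 'fall back to database'
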
def sync_script_running_info(self):
    data = []

    # Collect data from the sync cache
    cache_key = toolkit.get_cache_key('syncCache', 'scriptRunningInfo')
    for i in range(CONFIG['_SYNC_CACHE_BATCH_COUNT']):
        cache_res = self.cache_db.run('rpop', cache_key)
        if not cache_res:
            break

        try:
            cache_res = toolkit.json_loads(cache_res)
        except Exception as e:
            for line in traceback.format_exc().splitlines():
                self.logger.error(line)
        else:
            data.append(cache_res)

    # Compute the latest publish version of each function
    func_latest_version_map = {}
    for d in data:
        func_id                = d['funcId']
        script_publish_version = d['scriptPublishVersion']

        if func_id not in func_latest_version_map:
            func_latest_version_map[func_id] = script_publish_version
        else:
            func_latest_version_map[func_id] = max(script_publish_version, func_latest_version_map[func_id])

    # Aggregate by function / publish version / exec mode
    data_map = {}
    for d in data:
        func_id                = d['funcId']
        script_publish_version = d['scriptPublishVersion']
        exec_mode              = d['execMode']
        is_failed              = d['isFailed']
        cost                   = d['cost']
        timestamp              = d.get('timestamp')

        if not timestamp:
            continue

        # Skip records that belong to an outdated publish version
        latest_version = func_latest_version_map.get(func_id)
        if latest_version and script_publish_version < latest_version:
            continue

        if exec_mode is None:
            exec_mode = 'sync'

        pk = '~'.join([func_id, str(script_publish_version), exec_mode])
        if pk not in data_map:
            data_map[pk] = {
                'funcId'              : func_id,
                'scriptPublishVersion': script_publish_version,
                'execMode'            : exec_mode,
            }

        if 'succeedCount' not in data_map[pk]:
            data_map[pk]['succeedCount'] = 0

        if 'failCount' not in data_map[pk]:
            data_map[pk]['failCount'] = 0

        data_map[pk]['latestFailTimestamp']    = None
        data_map[pk]['latestSucceedTimestamp'] = None

        if is_failed:
            data_map[pk]['failCount'] += 1
            data_map[pk]['latestFailTimestamp'] = timestamp
            data_map[pk]['status'] = 'failed'
        else:
            data_map[pk]['succeedCount'] += 1
            data_map[pk]['latestSucceedTimestamp'] = timestamp
            data_map[pk]['status'] = 'succeeded'

        if 'minCost' not in data_map[pk]:
            data_map[pk]['minCost'] = cost
        else:
            data_map[pk]['minCost'] = min(data_map[pk]['minCost'], cost)

        if 'maxCost' not in data_map[pk]:
            data_map[pk]['maxCost'] = cost
        else:
            data_map[pk]['maxCost'] = max(data_map[pk]['maxCost'], cost)

        if 'totalCost' not in data_map[pk]:
            data_map[pk]['totalCost'] = cost
        else:
            data_map[pk]['totalCost'] += cost

        data_map[pk]['latestCost'] = cost

    # Write the aggregated data into the database
    for pk, d in data_map.items():
        func_id                = d['funcId']
        script_publish_version = d['scriptPublishVersion']
        exec_mode              = d['execMode']

        sql = '''
            SELECT
                 `succeedCount`
                ,`failCount`
                ,`minCost`
                ,`maxCost`
                ,`totalCost`
                ,`latestCost`
                ,UNIX_TIMESTAMP(`latestSucceedTime`) AS `latestSucceedTimestamp`
                ,UNIX_TIMESTAMP(`latestFailTime`)    AS `latestFailTimestamp`
                ,`status`
            FROM biz_rel_func_running_info
            WHERE
                    `funcId`               = ?
                AND `scriptPublishVersion` = ?
                AND `execMode`             = ?
            LIMIT 1
            '''
        sql_params = [
            func_id,
            script_publish_version,
            exec_mode,
        ]
        prev_info = self.db.query(sql, sql_params)

        if not prev_info:
            # No existing record: insert a new one
            sql = '''
                INSERT IGNORE INTO biz_rel_func_running_info
                SET
                     `funcId`               = ?
                    ,`scriptPublishVersion` = ?
                    ,`execMode`             = ?
                    ,`succeedCount`         = ?
                    ,`failCount`            = ?
                    ,`minCost`              = ?
                    ,`maxCost`              = ?
                    ,`totalCost`            = ?
                    ,`latestCost`           = ?
                    ,`latestSucceedTime`    = FROM_UNIXTIME(?)
                    ,`latestFailTime`       = FROM_UNIXTIME(?)
                    ,`status`               = ?
                '''
            sql_params = [
                func_id,
                script_publish_version,
                exec_mode,
                d['succeedCount'],
                d['failCount'],
                d['minCost'],
                d['maxCost'],
                d['totalCost'],
                d['latestCost'],
                d['latestSucceedTimestamp'],
                d['latestFailTimestamp'],
                d['status'],
            ]
            self.db.query(sql, sql_params)

        else:
            prev_info = prev_info[0]

            # Existing record: merge
            sql = '''
                UPDATE biz_rel_func_running_info
                SET
                     `succeedCount`      = ?
                    ,`failCount`         = ?
                    ,`minCost`           = ?
                    ,`maxCost`           = ?
                    ,`totalCost`         = ?
                    ,`latestCost`        = ?
                    ,`latestSucceedTime` = FROM_UNIXTIME(?)
                    ,`latestFailTime`    = FROM_UNIXTIME(?)
                    ,`status`            = ?
                WHERE
                        `funcId`               = ?
                    AND `scriptPublishVersion` = ?
                    AND `execMode`             = ?
                LIMIT 1
                '''
            sql_params = [
                d['succeedCount'] + (prev_info['succeedCount'] or 0),
                d['failCount']    + (prev_info['failCount']    or 0),
                min(filter(lambda x: x is not None, (d['minCost'], prev_info['minCost']))),
                max(filter(lambda x: x is not None, (d['maxCost'], prev_info['maxCost']))),
                d['totalCost'] + (prev_info['totalCost'] or 0),
                d['latestCost'],
                d['latestSucceedTimestamp'] or prev_info['latestSucceedTimestamp'],
                d['latestFailTimestamp']    or prev_info['latestFailTimestamp'],
                d['status'],
                func_id,
                script_publish_version,
                exec_mode,
            ]
            self.db.query(sql, sql_params)

    # Remove outdated records
    for func_id, latest_version in func_latest_version_map.items():
        sql = '''
            DELETE FROM biz_rel_func_running_info
            WHERE
                    `funcId`               = ?
                AND `scriptPublishVersion` != ?
                OR  UNIX_TIMESTAMP() - UNIX_TIMESTAMP(updateTime) > ?
            '''
        sql_params = [
            func_id,
            latest_version,
            3600 * 24 * 30,
        ]
        self.db.query(sql, sql_params)

def reload_script(self):
    global SCRIPT_MAP

    # 1. Fetch the IDs and code MD5s of all current scripts
    sql = '''
        SELECT
             `scpt`.`id`
            ,`scpt`.`codeMD5`
            ,`scpt`.`publishVersion`
            ,`sset`.`id` AS `scriptSetId`
        FROM biz_main_script AS scpt
        JOIN biz_main_script_set AS sset
            ON `scpt`.`scriptSetId` = `sset`.`id`
        '''
    db_res = self.db.query(sql)

    current_script_ids = set()
    reload_script_ids  = set()
    for d in db_res:
        script_id = d['id']
        current_script_ids.add(script_id)

        cached_script = SCRIPT_MAP.get(script_id)
        if not cached_script:
            # New script
            reload_script_ids.add(script_id)

        elif cached_script['codeMD5'] != d['codeMD5'] or cached_script['publishVersion'] != d['publishVersion']:
            # Updated script
            reload_script_ids.add(script_id)

    # Drop scripts that no longer exist
    script_ids_to_pop = []
    for script_id in SCRIPT_MAP.keys():
        if script_id not in current_script_ids:
            self.logger.debug('[SCRIPT CACHE] Remove {}'.format(script_id))
            script_ids_to_pop.append(script_id)

    for script_id in script_ids_to_pop:
        SCRIPT_MAP.pop(script_id, None)

    if reload_script_ids:
        # 2. Fetch the updated scripts from the database
        scripts = self.get_scripts(script_ids=reload_script_ids)
        for s in scripts:
            self.logger.debug('[SCRIPT CACHE] Load {}'.format(s['id']))

        # Merge the reloaded scripts
        reloaded_script_map = dict([(s['id'], s) for s in scripts])
        SCRIPT_MAP.update(reloaded_script_map)

        # 3. Write the dump and its MD5 into the cache
        self._cache_scripts()

        # 4. Remove cached function results of the reloaded scripts
        for script_id in reload_script_ids:
            func_id_pattern = '{0}.*'.format(script_id)
            cache_key = toolkit.get_cache_key('cache', 'funcResult', tags=[
                'funcId'       , func_id_pattern,
                'scriptCodeMD5', '*',
                'funcKwargsMD5', '*'])
            for k in self.cache_db.client.scan_iter(cache_key):
                self.cache_db.delete(six.ensure_str(k))

def on_heartbeat_sent(*args, **kwargs):
    global MAIN_PROCESS
    global CHILD_PROCESS_MAP
    global HEARTBEAT_EXEC_TIMESTAMP

    # Limit run interval
    current_timestamp = int(time.time())
    if current_timestamp - HEARTBEAT_EXEC_TIMESTAMP < CONFIG['_MONITOR_SYS_STATS_CHECK_INTERVAL']:
        return

    HEARTBEAT_EXEC_TIMESTAMP = current_timestamp

    # Get queue list
    _Q_flag = '-Q'

    # Record worker count
    worker_queues = []
    if _Q_flag in sys.argv:
        worker_queues = sys.argv[sys.argv.index(_Q_flag) + 1].split(',')
        worker_queues = list(map(lambda x: x.split('@').pop(), worker_queues))
        worker_queues.sort()
    else:
        worker_queues = [str(i) for i in range(CONFIG['_WORKER_QUEUE_COUNT'])]

    _expires = 30
    for q in worker_queues:
        cache_key = toolkit.get_cache_key('heartbeat', 'workerOnQueue',
                tags=['workerId', WORKER_ID, 'workerQueue', q])
        REDIS_HELPER.setex(cache_key, _expires, 'x')

        cache_pattern = toolkit.get_cache_key('heartbeat', 'workerOnQueue',
                tags=['workerId', '*', 'workerQueue', q])
        found_workers = REDIS_HELPER.keys(cache_pattern)

        cache_key = toolkit.get_cache_key('heartbeat', 'workerOnQueueCount',
                tags=['workerQueue', q])
        REDIS_HELPER.setex(cache_key, _expires, len(found_workers))

    # Record CPU/Memory
    if MAIN_PROCESS:
        total_cpu_percent = MAIN_PROCESS.cpu_percent()

        main_memory_info = MAIN_PROCESS.memory_full_info()
        total_memory_pss = main_memory_info.pss

        # Update child process map
        next_child_process_map = dict([(p.pid, p) for p in MAIN_PROCESS.children()])

        prev_child_pids = set(CHILD_PROCESS_MAP.keys())
        next_child_pids = set(next_child_process_map.keys())

        exited_pids = prev_child_pids - next_child_pids
        for pid in exited_pids:
            CHILD_PROCESS_MAP.pop(pid, None)

        new_pids = next_child_pids - prev_child_pids
        for pid in new_pids:
            new_child_process = next_child_process_map[pid]
            new_child_process.cpu_percent(interval=1)
            CHILD_PROCESS_MAP[pid] = new_child_process

        # Count up
        for p in CHILD_PROCESS_MAP.values():
            child_cpu_percent = p.cpu_percent()
            child_memory_info = p.memory_full_info()

            total_cpu_percent += child_cpu_percent
            total_memory_pss  += child_memory_info.pss

        total_cpu_percent = round(total_cpu_percent, 2)

        hostname = socket.gethostname()

        cache_key = toolkit.get_server_cache_key('monitor', 'sysStats',
                ['metric', 'workerCPUPercent', 'hostname', hostname])
        REDIS_HELPER.ts_add(cache_key, total_cpu_percent, timestamp=current_timestamp)

        cache_key = toolkit.get_server_cache_key('monitor', 'sysStats',
                ['metric', 'workerMemoryPSS', 'hostname', hostname])
        REDIS_HELPER.ts_add(cache_key, total_memory_pss, timestamp=current_timestamp)

def sync_script_failure(self):
    if not CONFIG['_INTERNAL_KEEP_SCRIPT_FAILURE']:
        return

    cache_key = toolkit.get_cache_key('syncCache', 'scriptFailure')
    for i in range(CONFIG['_BUILTIN_TASK_SYNC_CACHE_BATCH_COUNT']):
        cache_res = self.cache_db.run('rpop', cache_key)
        if not cache_res:
            break

        try:
            cache_res = ujson.loads(cache_res)
        except Exception as e:
            for line in traceback.format_exc().splitlines():
                self.logger.error(line)
            continue

        func_id                = cache_res['funcId']
        script_publish_version = cache_res['scriptPublishVersion']
        exec_mode              = cache_res['execMode']
        einfo_text             = cache_res.get('einfoTEXT')
        trace_info             = cache_res.get('traceInfo')
        timestamp              = cache_res.get('timestamp')

        if not all([einfo_text, timestamp]):
            continue

        if exec_mode is None:
            exec_mode = 'sync'

        # Record the script failure
        failure_id = gen_script_failure_id()

        exception = None
        if trace_info:
            exception = trace_info.get('exceptionDump') or ''
            if isinstance(exception, six.string_types):
                exception = exception.split(':')[0]
            else:
                exception = None

            trace_info = simplejson.dumps(trace_info, default=toolkit.json_dump_default)

        sql = '''
            INSERT INTO biz_main_script_failure
            SET
                 `id`                    = ?
                ,`funcId`                = ?
                ,`scriptPublishVersion`  = ?
                ,`execMode`              = ?
                ,`einfoTEXT`             = ?
                ,`exception`             = ?
                ,`traceInfoJSON`         = ?
                ,`createTime`            = FROM_UNIXTIME(?)
                ,`updateTime`            = FROM_UNIXTIME(?)
            '''
        sql_params = [
            failure_id,
            func_id,
            script_publish_version,
            exec_mode,
            einfo_text,
            exception,
            trace_info,
            timestamp,
            timestamp,
        ]
        self.db.query(sql, sql_params)

def sync_script_log(self):
    if not CONFIG['_INTERNAL_KEEP_SCRIPT_LOG']:
        return

    cache_key = toolkit.get_cache_key('syncCache', 'scriptLog')

    # When the queue grows too long, degrade the service and stop recording some entries
    queue_length = 0
    cache_res = self.cache_db.run('llen', cache_key)
    if cache_res:
        queue_length = int(cache_res)

    is_service_degraded = queue_length > CONFIG['_BUILTIN_TASK_SYNC_CACHE_SERVICE_DEGRADE_QUEUE_LENGTH']

    for i in range(CONFIG['_BUILTIN_TASK_SYNC_CACHE_BATCH_COUNT']):
        cache_res = self.cache_db.run('rpop', cache_key)
        if not cache_res:
            break

        # While degraded, drop entries at random
        if is_service_degraded:
            if random.randint(0, queue_length) * 2 > CONFIG['_BUILTIN_TASK_SYNC_CACHE_SERVICE_DEGRADE_QUEUE_LENGTH']:
                continue

        try:
            cache_res = ujson.loads(cache_res)
        except Exception as e:
            for line in traceback.format_exc().splitlines():
                self.logger.error(line)
            continue

        func_id                = cache_res['funcId']
        script_publish_version = cache_res['scriptPublishVersion']
        exec_mode              = cache_res['execMode']
        log_messages           = cache_res.get('logMessages')
        timestamp              = cache_res.get('timestamp')

        if not all([log_messages, timestamp]):
            continue

        if exec_mode is None:
            exec_mode = 'sync'

        # Record the script log
        log_id = gen_script_log_id()
        message_text = '\n'.join(log_messages).strip()

        sql = '''
            INSERT INTO biz_main_script_log
            SET
                 `id`                    = ?
                ,`funcId`                = ?
                ,`scriptPublishVersion`  = ?
                ,`execMode`              = ?
                ,`messageTEXT`           = ?
                ,`createTime`            = FROM_UNIXTIME(?)
                ,`updateTime`            = FROM_UNIXTIME(?)
            '''
        sql_params = [
            log_id,
            func_id,
            script_publish_version,
            exec_mode,
            message_text,
            timestamp,
            timestamp,
        ]
        self.db.query(sql, sql_params)

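# Rough sketch (illustrative only) of the drop probability implied by the random check
# above: an entry is kept only when randint(0, queue_length) * 2 <= threshold, so the
# further the backlog grows past the threshold, the more entries are discarded.
def _demo_degrade_drop_probability(queue_length, threshold):
    dropped = sum(1 for x in range(queue_length + 1) if x * 2 > threshold)
    return dropped / (queue_length + 1)

# _demo_degrade_drop_probability(queue_length=2000, threshold=1000) -> ~0.75
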
def sync_task_info(self):
    cache_key = toolkit.get_cache_key('syncCache', 'taskInfo')

    # When the queue grows too long, degrade the service and stop recording some entries
    queue_length = 0
    cache_res = self.cache_db.run('llen', cache_key)
    if cache_res:
        queue_length = int(cache_res)

    is_service_degraded = queue_length > CONFIG['_BUILTIN_TASK_SYNC_CACHE_SERVICE_DEGRADE_QUEUE_LENGTH']

    for i in range(CONFIG['_BUILTIN_TASK_SYNC_CACHE_BATCH_COUNT']):
        cache_res = self.cache_db.run('rpop', cache_key)
        if not cache_res:
            break

        try:
            cache_res = ujson.loads(cache_res)
        except Exception as e:
            for line in traceback.format_exc().splitlines():
                self.logger.error(line)
            continue

        task_id                = cache_res['taskId']
        origin                 = cache_res['origin']
        origin_id              = cache_res['originId']
        func_id                = cache_res.get('funcId')
        script_publish_version = cache_res.get('scriptPublishVersion')
        exec_mode              = cache_res.get('execMode')
        status                 = cache_res['status']
        log_messages           = cache_res.get('logMessages') or []
        einfo_text             = cache_res.get('einfoTEXT')   or ''
        timestamp              = cache_res.get('timestamp')

        if not all([origin, exec_mode, origin_id, timestamp]):
            continue

        if origin not in ('crontab', 'batch') and exec_mode != 'crontab':
            continue

        message_text = '\n'.join(log_messages).strip()

        # Record the task info
        table_name      = None
        origin_id_field = None
        if origin == 'crontab' or exec_mode == 'crontab':
            table_name      = 'biz_main_crontab_task_info'
            origin_id_field = 'crontabConfigId'
        elif origin == 'batch':
            table_name      = 'biz_main_batch_task_info'
            origin_id_field = 'batchId'

        sql        = None
        sql_params = None

        # Handle differently depending on whether the service is degraded
        if not is_service_degraded:
            # Not degraded: handle every status normally
            if status == 'queued':
                sql = '''
                    INSERT INTO ??
                    SET
                         `id`                    = ?
                        ,`??`                    = ?
                        ,`funcId`                = ?
                        ,`scriptPublishVersion`  = ?
                        ,`queueTime`             = FROM_UNIXTIME(?)
                        ,`createTime`            = FROM_UNIXTIME(?)
                        ,`updateTime`            = FROM_UNIXTIME(?)
                    '''
                sql_params = [
                    table_name,
                    task_id,
                    origin_id_field, origin_id,
                    func_id,
                    script_publish_version,
                    timestamp,
                    timestamp,
                    timestamp,
                ]

            elif status == 'pending':
                sql = '''
                    UPDATE ??
                    SET
                         `funcId`                = IFNULL(?, `funcId`)
                        ,`scriptPublishVersion`  = IFNULL(?, `scriptPublishVersion`)
                        ,`startTime`             = FROM_UNIXTIME(?)
                        ,`status`                = ?
                        ,`updateTime`            = FROM_UNIXTIME(?)
                    WHERE
                        `id` = ?
                    '''
                sql_params = [
                    table_name,
                    func_id,
                    script_publish_version,
                    timestamp,
                    status,
                    timestamp,
                    task_id,
                ]

            else:
                sql = '''
                    UPDATE ??
                    SET
                         `funcId`                = IFNULL(?, `funcId`)
                        ,`scriptPublishVersion`  = IFNULL(?, `scriptPublishVersion`)
                        ,`endTime`               = FROM_UNIXTIME(?)
                        ,`status`                = ?
                        ,`logMessageTEXT`        = ?
                        ,`einfoTEXT`             = ?
                        ,`updateTime`            = FROM_UNIXTIME(?)
                    WHERE
                        `id` = ?
                    '''
                sql_params = [
                    table_name,
                    func_id,
                    script_publish_version,
                    timestamp,
                    status,
                    message_text,
                    einfo_text,
                    timestamp,
                    task_id,
                ]

        else:
            # Degraded: only record final results
            if status in ('success', 'failure'):
                sql = '''
                    REPLACE INTO ??
                    SET
                         `id`                    = ?
                        ,`??`                    = ?
                        ,`funcId`                = ?
                        ,`scriptPublishVersion`  = ?
                        ,`endTime`               = FROM_UNIXTIME(?)
                        ,`status`                = ?
                        ,`logMessageTEXT`        = ?
                        ,`einfoTEXT`             = ?
                        ,`createTime`            = FROM_UNIXTIME(?)
                        ,`updateTime`            = FROM_UNIXTIME(?)
                    '''
                sql_params = [
                    table_name,
                    task_id,
                    origin_id_field, origin_id,
                    func_id,
                    script_publish_version,
                    timestamp,
                    status,
                    message_text,
                    einfo_text,
                    timestamp,
                    timestamp,
                ]
            else:
                continue

        self.db.query(sql, sql_params)
