def jobReadyWatch(children):
    try:
        for jobId in children:
            try:
                job = StoreHolder.getServerStore().getJob(jobId)
                if CacheHolder.getCache().hasKey(jobId, JOBS) is False:
                    CacheHolder.getCache().put(jobId, job, JOBS)
                for taskName in job.tasks:
                    TaskCacheHolder.getJobCache().put(taskName, job.jobId)
            except Exception:
                Logger.exception(log)
    except Exception:
        Logger.exception(log)
def workWatch(children):
    for hostname in children:
        if CacheHolder.getCache().hasKey(hostname, WORKS) is False:
            work = StoreHolder.getServerStore().getWork(hostname)
            CacheHolder.getCache().put(hostname, work, WORKS)
            parent = "/" + CABBAGE + "/" + WORKS + "/" + hostname
            kazooClient.addDataListener(parent + "/" + ON_LINE, workOnlineWatch)
def _initJobs(self):
    store = StoreHolder.getRetryStore()
    jobs = store.getJobs()
    work = store.getWork(HOST_NAME)
    queues = work.queues
    routes = {}
    for job in jobs:
        if job.status != JOB_DELETE and job.brokerQueue in queues:
            # fixbug: cache the job when workers are scaled out dynamically
            if not CacheHolder.getCache().hasKey(job.jobId, JOBS):
                CacheHolder.getCache().put(job.jobId, job, JOBS)
            clientDir = ConfigHolder.getConfig().getProperty(BASE, CLIENT_FILE_DIRECTORY)
            path = clientDir + "/" + job.jobId
            if not os.path.isdir(path):
                # @FIX BUG: job files were not being synced to this node
                syncJob(job.jobId, store)
            self.addScriptJobId(job.jobId)
            for taskName in job.tasks:
                que = store.getQueue(job.brokerQueue)
                routes[taskName] = {'queue': que.queueName, 'routing_key': que.routingKey}
    celeryconfig.CELERY_ROUTES = routes
def syncJob(jobId, store=None):
    if store is None:
        with storeFactory.store() as store:
            work = store.getWork(HOST_NAME)
            store.removeJobWorkReady(jobId, work)
    else:
        work = store.getWork(HOST_NAME)
        store.removeJobWorkReady(jobId, work)
    job = CacheHolder.getCache().get(jobId, JOBS)
    fileNames = []
    # The main job script is not synced here; only attached files are pulled.
    if job.attachFiles:
        for attachFile in job.attachFiles:
            CacheHolder.getCache().put(attachFile.fileName, "", jobId)
            try:
                syncFile(attachFile.fileName, jobId, FileRequestMessage.ATTACH)
            except Exception:
                syncFile(attachFile.fileName, jobId, FileRequestMessage.ATTACH)
            fileNames.append(attachFile.fileName)
    checkAllFileIsReady(fileNames, jobId)
def updateJobCache(jobId):
    store = storeFactory.getStore()
    try:
        job = store.getJob(jobId)
        CacheHolder.getCache().put(jobId, job, JOBS)
    finally:
        storeFactory.returnStroe(store)
def getCacheJobs(self):
    jobIds = CacheHolder.getCache().getRegion(JOBS).namespace.keys()
    jobs = []
    for jobId in jobIds:
        jobs.append(CacheHolder.getCache().get(jobId, JOBS))
    return jobs
def workBrokerQueueChangeHandler(event):
    with storeFactory.store() as store:
        work = store.getWork(HOST_NAME)
    CacheHolder.getCache().put(HOST_NAME, work, WORKS)
    if event.isEvent and (work.status == ON_LINE):
        Logger.info(log, "restart")
        CabbageControlHolder.getCabbageControl().restartCelery()
def workWatch(children):
    store = storeFactory.getStore()
    try:
        for hostname in children:
            if CacheHolder.getCache().hasKey(hostname, WORKS) is False:
                work = store.getWork(hostname)
                CacheHolder.getCache().put(hostname, work, WORKS)
    finally:
        storeFactory.returnStroe(store)
def getJobByPage(self, store, limit, offset):
    jobIds = store.getJobIds()
    totalCount = len(jobIds)
    ids = jobIds[offset:offset + limit]
    jobs = []
    for jobId in ids:
        job = store.getJob(jobId)
        jobs.append(job)
        CacheHolder.getCache().put(jobId, job, JOBS)
    return (jobs, totalCount)
def workBrokerServerChangeHandler(event):
    Logger.debug(log, "------workBrokerServerChangeHandler-----------")
    hostName = HOST_NAME
    with storeFactory.store() as store:
        work = store.getWork(hostName)
    CacheHolder.getCache().put(hostName, work, WORKS)
    if event.isEvent and (work.status == READY or work.status == ON_LINE):
        CabbageControlHolder.getCabbageControl().restartCelery()
def workStatusHandler(event):
    if event:
        work = CacheHolder.getCache().get(event.hostName, WORKS)
        if work:
            work.status = event.status
            with storeFactory.store() as store:
                store.updateWorkStatus(work)
def doAction(jobId):
    if not CacheHolder.getCache().hasKey(jobId, JOBS):
        with storeFactory.store() as store:
            job = store.getJob(jobId)
            CacheHolder.getCache().put(jobId, job, JOBS)
    job = CacheHolder.getCache().get(jobId, JOBS)
    Logger.debug(log, "upload files. job【%s】" % str(job.asDict()))
    if job.resultBackend is None:
        return
    elif job.resultBackend == NFS:
        CabbageNfsBackend(jobId).save()
    elif job.resultBackend == HDFS:
        CabbageHdfsBackend(jobId).save()
def addQueue(self, store, brokerServer, brokerQueue, exchange, routingKey, nodes, priority):
    exchange = exchange if exchange else brokerQueue
    routingKey = routingKey if routingKey else brokerQueue
    workApi = WorkApi()
    works = []
    if nodes and nodes[0] == "-1":
        works = workApi.getWorks()
    elif nodes:
        for node in nodes:
            workName = node.split(":")[0]
            works.append(workApi.getWork(workName))
    else:
        works = workApi.getWorks()

    brokerServer = brokerServer.split(":")[0]
    server = store.getBrokerServer(brokerServer)
    client = KombuClient(url=server.connectUri)
    client.addQueue(brokerQueue, exchangeName=exchange, routingKey=routingKey, priority=priority)

    queue = BrokerQueue(server=brokerServer,
                        queueName=brokerQueue,
                        exchangeName=exchange,
                        routingKey=routingKey,
                        works=[work.hostName for work in works],
                        priority=priority)
    store.saveQueue(queue)
    store.addServerBrokerQueue(server, queue)

    queues = {}
    for work in works:
        jobIds = store.getJobIds()
        for jobId in jobIds:
            job = CacheHolder.getCache().get(jobId, JOBS)
            workIsRun = False
            for w in job.works:
                if work.hostName == w.hostName and job.status != JOB_DELETE:
                    workIsRun = True
            if work.brokerServer != brokerServer and workIsRun:
                raise Exception("Job [%s] has not been deleted and still uses this node, so the "
                                "node cannot switch broker servers; adding the queue failed!" % job.jobId)
        if work.brokerServer != brokerServer:
            for q in work.queues:
                if q in queues:
                    queues[q] = queues[q] + [work.hostName]
                else:
                    queues[q] = [work.hostName]
            store.addWorkBrokerServerBrokerQueue(work, brokerServer, brokerQueue)
    for key, values in queues.items():
        for v in values:
            store.removeWorkFromQueue(key, str(v))
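# Hedged usage sketch for addQueue (the "queueApi" object, host and queue names are
# assumptions for illustration, not project APIs confirmed here): nodes=["-1"] binds the
# new queue to every registered work node, and priority becomes the queue's
# x-max-priority on the broker.
#
#   with storeFactory.store() as store:
#       queueApi.addQueue(store, brokerServer="rabbit-1:5672", brokerQueue="q_crawl",
#                         exchange=None, routingKey=None, nodes=["-1"], priority=10)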
def _getConnectUri(self):
    connectUri = ConfigHolder.getConfig().getProperty(BASE, CONNECT_URI)
    work = CacheHolder.getCache().get(HOST_NAME, WORKS)
    if work.brokerServer:
        brokerServer = StoreHolder.getStore().getBrokerServer(work.brokerServer)
        connectUri = brokerServer.connectUri
    return connectUri
def checkAllFileIsReady(fileNames, jobId, num=0):
    if num > 10:
        return
    for fileName in fileNames:
        value = CacheHolder.getCache().get(fileName, jobId)
        if value != DONE:
            time.sleep(10)
            checkAllFileIsReady(fileNames, jobId, num + 1)
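# Note on the check above: every file that is not yet DONE triggers a 10-second sleep
# followed by a recursive re-check of all files, and the whole check stops once the
# recursion depth exceeds 10, so a file that never reaches DONE is eventually abandoned.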
def jobUpdateHandler(event):
    jobId = event.jobId
    status = event.status
    if status == JOB_DELETE:
        jobRun = JobCacheHolder.getJobCache().get(jobId)
        if jobRun:
            # stop the running tasks
            jobRun.stop()
        with storeFactory.store() as store:
            store.updateJobStatus(jobId, JOB_DELETE)
        # remove the tasks from the cache so the next task can reuse the same name
        tasks = CacheHolder.getCache().get(jobId, JOBS).tasks
        for taskName in tasks:
            if TaskCacheHolder.getJobCache().has_key(taskName):
                TaskCacheHolder.getJobCache().remove(taskName)
    with storeFactory.store() as store:
        job = store.getJob(jobId)
        CacheHolder.getCache().put(jobId, job, JOBS)
def jobRunHandler(event):
    jobId = event.jobId
    params = event.params
    ignoreNotPerWork = event.ignoreNotPerWork
    if not CacheHolder.getCache().hasKey(jobId, JOBS):
        with storeFactory.store() as store:
            job = store.getJob(jobId)
            CacheHolder.getCache().put(jobId, job, JOBS)
    else:
        job = CacheHolder.getCache().get(jobId, JOBS)

    if ignoreNotPerWork:
        runJob(job, params)
    else:
        (isAllBeReady, works) = checkAllWorkBeReady(job)
        if isAllBeReady:
            runJob(job, params)
        else:
            raise Exception("works: %s are not ready" % ",".join(works))
def run(self):
    clientDir = ConfigHolder.getConfig().getProperty(BASE, CLIENT_FILE_DIRECTORY)
    jobId = self.message.jobId
    jobDir = clientDir + "/" + jobId
    fileName = self.message.fileName
    fileContent = self.message.fileContent
    if os.path.exists(jobDir) is False:
        os.mkdir(jobDir)
        os.mkdir(jobDir + "/result")
    filePath = jobDir + "/" + fileName
    if os.path.exists(filePath):
        os.remove(filePath)
    with open(filePath, "w") as f:
        f.write(base64.decodestring(fileContent))
    # Mark this file as ready; once all files are ready the server is notified
    # that this client can execute the job.
    CacheHolder.getCache().put(fileName, DONE, jobId)
def jobWebWatch(children):
    store = storeFactory.getStore()
    try:
        brokers = {}
        for jobId in children:
            try:
                job = store.getJob(jobId)
                if CacheHolder.getCache().hasKey(jobId, JOBS) is False:
                    CacheHolder.getCache().put(jobId, job, JOBS)
                # Shortcut: every job that has not been deleted is added to the router
                if job.status != JOB_DELETE:
                    brokerServer = job.brokerServer
                    routes = {}
                    for taskName in job.tasks:
                        que = store.getQueue(job.brokerQueue)
                        routes[taskName] = {'queue': que.queueName, 'routing_key': que.routingKey}
                        TaskCacheHolder.getJobCache().put(taskName, job.jobId)
                    if brokerServer in brokers:
                        brokers[brokerServer].update(routes)
                    else:
                        brokers[brokerServer] = routes
            except Exception:
                Logger.exception(log)

        for broker, routes in brokers.items():
            brokerServer = store.getBrokerServer(broker)
            # Bug fix: tasks were being submitted to the wrong Celery queue
            cabbage = Cabbage(hostName=brokerServer.hostName, broker=brokerServer.connectUri)
            cabbage.app.conf.update(CELERY_ROUTES=routes)
            CabbageHolder.getServerCabbages()[brokerServer.hostName] = cabbage
            Logger.info(log, "Updated broker server 【%s】 routes 【%s】" % (brokerServer.hostName, str(routes)))
    except Exception:
        Logger.exception(log)
    finally:
        storeFactory.returnStroe(store)
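# Illustrative shape of the brokers mapping built in jobWebWatch (host, job and task names
# below are made up): routes from every non-deleted job are grouped per broker server
# before CELERY_ROUTES is pushed to that server's Cabbage app.
#
#   brokers = {
#       'broker-host-1': {
#           'job123.fetch': {'queue': 'q_crawl', 'routing_key': 'q_crawl'},
#           'job456.parse': {'queue': 'q_parse', 'routing_key': 'q_parse'},
#       },
#   }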
def _initJobs(self, cabbage):
    store = StoreHolder.getRetryStore()
    jobs = store.getJobs()
    work = store.getWork(HOST_NAME)
    queues = work.queues
    routes = {}
    queues_celery = []
    for que in queues:
        que = store.getQueue(que)
        queues_celery.append(
            Queue(que.queueName,
                  Exchange(que.queueName),
                  routing_key=que.queueName,
                  queue_arguments={'x-max-priority': int(que.priority)}))
    for job in jobs:
        if job.status != JOB_DELETE and job.brokerQueue in queues:
            # fixbug: cache the job when workers are scaled out dynamically
            if not CacheHolder.getCache().hasKey(job.jobId, JOBS):
                CacheHolder.getCache().put(job.jobId, job, JOBS)
            clientDir = ConfigHolder.getConfig().getProperty(BASE, CLIENT_FILE_DIRECTORY)
            path = clientDir + "/" + job.jobId
            if not os.path.isdir(path):
                # @FIX BUG: job files were not being synced to this node
                syncJob(job.jobId, store)
            self.addScriptJobId(job.jobId, cabbage)
            for taskName in job.tasks:
                que = store.getQueue(job.brokerQueue)
                routes[taskName] = {'queue': que.queueName, 'routing_key': que.routingKey}
    log.info(routes)
    celeryconfig.CELERY_QUEUES = tuple(queues_celery)
    celeryconfig.CELERY_ROUTES = routes
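# Minimal sketch (queue and task names are made up, not read from the store) of the
# celeryconfig values that _initJobs above produces for a work bound to one priority queue
# and one non-deleted job:
#
#   celeryconfig.CELERY_QUEUES = (
#       Queue('q_crawl', Exchange('q_crawl'), routing_key='q_crawl',
#             queue_arguments={'x-max-priority': 10}),
#   )
#   celeryconfig.CELERY_ROUTES = {
#       'job123.fetch': {'queue': 'q_crawl', 'routing_key': 'q_crawl'},
#   }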
def taskSucceeded(state, event, app):
    taskId = event['uuid']
    task = state.tasks.get(taskId)
    # FIXME: task.name is often missing from the event state, so the job id and task name
    # are resolved through _getJobIdAndTaskName instead.
    jobId, taskName = _getJobIdAndTaskName(taskId)
    queueTime = 0
    runtime = 0
    log.debug("【%s】 TASK SUCCEEDED !" % (event['uuid']))
    try:
        job = CacheHolder.getCache().get(jobId, JOBS)
        result = AsyncResult(taskId, app=CabbageHolder.getServerCabbage(job.brokerServer).getApp())
        if not isinstance(result.backend, DisabledBackend):
            log.debug("【%s】 TASK SUCCEEDED result【%s】 !" % (event['uuid'], result.result))
            if result.result:
                jobResults.addResult(jobId, result.result)
        if task and task.started and task.received:
            queueTime = task.started - task.received
            runtime = event['runtime']
        CabbageCounterHolder.getCabbageCounter().updateTaskSucceeded(
            taskName, _getHostName(event), runtime, queueTime)
    except Exception:
        Logger.exception(log)
def start(self):
    work = Work()
    work.hostName = HOST_NAME
    work.port = "1024"
    work.ip = LOCAL_IP
    work.status = READY
    with storeFactory.store() as store:
        if not store.isExistWork(work):
            store.saveWork(work)
            CacheHolder.getCache().put(HOST_NAME, work, WORKS)
        else:
            CacheHolder.getCache().put(HOST_NAME, store.getWork(HOST_NAME), WORKS)
            store.updateWorkStatus(work)
        self.runCeleryServer(work, store)
    t2 = threading.Thread(target=scheduler)
    t2.setDaemon(True)
    t2.start()
    tornado.ioloop.IOLoop.current().start()
def getCabbage(self):
    connectUri = CabbageHolder._getConnectUri()
    work = CacheHolder.getCache().get(HOST_NAME, WORKS)
    if work.queues and len(work.queues) > 0:
        queues = []
        for queueName in work.queues:
            brokerQueue = StoreHolder.getStore().getQueue(queueName)
            queues.append(
                Queue(name=brokerQueue.queueName,
                      exchange=Exchange(brokerQueue.exchangeName),
                      routing_key=brokerQueue.routingKey))
        celeryconfig.CELERY_QUEUES = tuple(queues)
    return Cabbage(broker=str(connectUri))
def jobRemoveHandler(event):
    try:
        jobId = event.jobId
        if JobCacheHolder.getJobCache().has_key(jobId):
            jobRun = JobCacheHolder.getJobCache().get(jobId)
            if jobRun:
                # stop the running tasks
                jobRun.stop()
        else:
            job = CacheHolder.getCache().get(jobId, JOBS)
            for taskName in job.tasks:
                CabbageHolder.getServerCabbage(job.brokerServer).revokeByTaskName(taskName)
        with storeFactory.store() as store:
            store.updateJobStatus(jobId, JOB_DELETE)
        # remove the tasks from the cache so the next task can reuse the same name
        tasks = CacheHolder.getCache().get(jobId, JOBS).tasks
        for taskName in tasks:
            if TaskCacheHolder.getJobCache().has_key(taskName):
                TaskCacheHolder.getJobCache().remove(taskName)
        CacheHolder.getCache().remove(jobId, JOBS)
    except Exception:
        Logger.exception(log)
def clentWorkStatusChangeHandler(event):
    from cabbage.client_start import CabbageClientHolder
    hostName = HOST_NAME
    with storeFactory.store() as store:
        work = store.getWork(hostName)
    CacheHolder.getCache().put(hostName, work, WORKS)
    clentStatus = CabbageClientHolder.getClient().status
    if event.status == OFF_LINE and clentStatus == ON_LINE:
        CabbageControlHolder.getCabbageControl().stopCelery(hostName)
        CabbageClientHolder.getClient().status = OFF_LINE
    if event.status == ON_LINE and clentStatus == OFF_LINE:
        CabbageClientHolder.getClient().status = ON_LINE
        CabbageControlHolder.getCabbageControl().restartCelery()
    if event.status == REMOVE and clentStatus != REMOVE:
        CabbageClientHolder.getClient().stop()
def jobChildWatch(children):
    try:
        for jobId in children:
            jobId = str(jobId)
            with storeFactory.store() as store:
                job = store.getJob(jobId)
                work = store.getWork(HOST_NAME)
            if CacheHolder.getCache().hasKey(jobId, JOBS) is False and job.brokerQueue in work.queues:
                # add status watches for this job
                parent = "/" + CABBAGE + "/" + JOBS + "/" + jobId
                kazooClient.addDataListener(parent + "/" + STATUS, jobRunStatusWatch)
                kazooClient.addDataListener(parent + "/" + AUDIT_STATUS, jobAduitStatusWatch)
                updateJobCache(jobId, job)
    except Exception:
        Logger.exception(log)
def taskFailed(state, event, app):
    eventOutDic = event.copy()
    taskName = None
    try:
        taskId = event['uuid']
        task = state.tasks.get(taskId)
        jobId, taskName = _getJobIdAndTaskName(taskId)
        eventOutDic[JOB_ID] = jobId
        job = CacheHolder.getCache().get(eventOutDic[JOB_ID], JOBS)
        brokerServer = job.brokerServer
        taskPath = ConfigHolder.getConfig().getProperty(BASE, TASK_FAILLOG_PATH)
        if not os.path.isdir(taskPath):
            os.makedirs(taskPath)
        dateStr = getNowDateStr()
        with open(taskPath + "/" + brokerServer + "_" + dateStr + ".log", "a+") as writer:
            writer.write(str(eventOutDic) + "\n")
        CabbageCounterHolder.getCabbageCounter().updateTaksFail(taskName, _getHostName(event))
    except Exception:
        Logger.exception(log)
def uploadClientScheduler():
    jobIds = CacheHolder.getCache().getRegion(JOBS).namespace.keys()
    for jobId in jobIds:
        JobUploadPoolHolder.getJobUploadPool().add(jobId)
def updateJobCache(jobId):
    job = StoreHolder.getServerStore().getJob(jobId)
    CacheHolder.getCache().put(jobId, job, JOBS)
def getJobByJobId(self, store, jobId):
    if not CacheHolder.getCache().hasKey(jobId, JOBS):
        return None
    else:
        return CacheHolder.getCache().get(jobId, JOBS)