def __init__(self, hosts, nodeInfo, eventCallback):
    """Parse the zk:// meta URL, connect to ZooKeeper and join the worker party.

    :param hosts: meta URL of the form zk://[auth@]host1:port1,host2:port2/rootpath
    :param nodeInfo: dict describing this worker node
    :param eventCallback: callable invoked with ZooKeeper event payloads
    :raises ValueError: when the scheme is not 'zk' or the root path is missing
    """
    parsed = urlparse(hosts)
    if parsed.scheme != 'zk':
        raise ValueError("Meta URL must start with zk://")
    if parsed.path in ('/', ''):
        raise ValueError("Service root path not found.")
    self.rootpath = parsed.path.rstrip('/') + '/oceanworker'
    # NOTE(review): logged at error level apparently for visibility - consider log.info
    log.error("ROOT = %s" % self.rootpath)
    # NOTE: currently, auth_data is not supported
    servers = parsed.netloc.split('@')[-1]
    self.masterLock = None
    self.callback = eventCallback
    self.client = KazooClient(servers)
    self.client.add_listener(self.__connectionState)
    self.lastEventUpdated = 0
    self.bRun = False
    self.nodeInfo = nodeInfo
    self.party = self.client.ShallowParty(self.rootpath + '/workers',
                                          identifier=settings.UUID.get_hex())
    self.node = self.party.node
    self.state = self.client.state
    self.devTasks = {}
    self.client.start()
def setNodeInfo(self, info):
    """Publish this worker's info to its party znode and bump the party-root timestamp.

    :param info: JSON-serializable dict describing the worker
    """
    self.nodeInfo = info
    try:
        self.client.set('%s/%s' % (self.party.path, self.node), json.dumps(info))
        # touch the party root so watchers see a fresh lastUpdate timestamp
        self.client.set('%s' % (self.party.path),
                        json.dumps({'worker': self.node, 'lastUpdate': time.time()}))
    except Exception as e:
        log.error(e)
def feeder(self, action="status"):
    """start/stop henem message feeder

    http://url/henem/control/feeder?action=<command>
    :param action: command (start, stop, status)
    :return: result message
    """
    try:
        cmd = str(action).lower()
        if cmd not in ['start', 'stop', 'status']:
            return "NOT OK"
        if cmd == "stop":
            for feeder in self.henem.feeders:
                feeder.command("pause")
        elif cmd == "start":
            for feeder in self.henem.feeders:
                feeder.command("resume")
        elif cmd == "status":
            msg = ""
            # single pass over the feeders (the original nested an identical
            # loop inside this one, duplicating every line of output)
            for feeder in self.henem.feeders:
                msg += "message feeding" if not feeder.pause else "feeding stopped\n"
            return msg
        return "message feeding %s" % cmd
    except Exception as e:
        log.error("RESTAPI - feeder : %s" % e)
        raise cherrypy.HTTPError(500)
def __updateWorkerTasks(self, tasks):
    """Reconcile running task slots with the desired *tasks* list.

    Slots already running a wanted task are kept; new tasks get a slot from
    the pool (importing the task's module on demand); slots whose task is no
    longer wanted are released back to the pool.
    """
    newTaskSlots = []
    for task in tasks:
        # TODO : Check module version and task name
        matched = [x for x in self.taskSlots if x.taskName == task]
        if matched:
            # task already has a running slot - keep it
            self.taskSlots.remove(matched[0])
            newTaskSlots.append(matched[0])
        else:
            moduleName = self.taskList.get(task)
            if moduleName is None:
                log.error("Task not found.. - %s" % self.taskList)
                continue
            # import the task module only once per process
            if moduleName not in sys.modules:
                module = importlib.import_module(moduleName)
            else:
                module = sys.modules[moduleName]
            handler = getattr(module, task)
            slot = self.slotPool.requestSlot(task, handler)
            newTaskSlots.append(slot)
    # release leftover slots (explicit loop: map() for side effects is a
    # Py2-ism and a silent no-op on Python 3)
    for slot in self.taskSlots:
        self.slotPool.releaseSlot(slot)
    self.taskSlots = newTaskSlots
def onMessage(self, body, message):
    """Dispatch an incoming AMQP message body to the registered callback.

    The message is acked only after the callback returns, so a failing
    callback leaves the message un-acked (eligible for redelivery).
    """
    if self.callback:
        try:
            self.callback(body)
            message.ack()
        except Exception as e:
            log.error(e)
def __loadDevTask(self):
    """Scan DIR_DEV_MODULE for dev task modules and start a slot per subscribed task.

    Every package directory under DIR_DEV_MODULE is imported; each attribute
    carrying OW_TASK_SUBSCRIBE_EVENTS becomes a '<name>_dev' slot. Slot
    acquisition is retried up to 3 times.
    """
    info = self.metaClient.getNodeInfo()
    info['devTasks'] = []
    info['environment'] = self.environment
    for name in os.listdir(os.path.join(os.getcwd(), DIR_DEV_MODULE)):
        if not os.path.isdir(os.path.join(os.getcwd(), DIR_DEV_MODULE, name)):
            continue
        try:
            module = importlib.import_module(name)
            for attrName in dir(module):
                attr = getattr(module, attrName)
                if hasattr(attr, 'OW_TASK_SUBSCRIBE_EVENTS'):
                    log.info("Load Task - %s" % attr.__name__)
                    retryCnt = 3
                    while retryCnt > 0:
                        # start slots after request complete, because AMQP
                        # communication affects zookeeper communication.
                        # TODO: solve this problem.
                        slot = self.slotPool.requestSlot('%s_dev' % attr.__name__, attr,
                                                         auto_start=False)
                        if slot is not None:
                            info['devTasks'].append(slot.taskName)
                            self.metaClient.addDevTask(slot.taskName, slot.task)
                            self.devTaskSlots.append(slot)
                            break
                        else:
                            retryCnt -= 1
                            gevent.sleep(1)
        except ImportError as e:
            log.error("Import ERROR - [%s] %s" % (name, e))
def sendNodeEvent(self, node, event):
    """Write an event payload to a worker's event znode.

    :param node: target worker node name
    :param event: JSON-serializable dict; must contain a 'tasks' key
    :return: True on success, False on failure (the original fell off the end
             returning None on success while returning False on error)
    """
    try:
        log.info("Set node info - %s, %s" % (node, event['tasks']))
        self.client.set('%s/event/worker/%s' % (self.rootpath, node), json.dumps(event))
        return True
    except Exception as e:
        log.error(e)
        return False
def allocDictCursor(name):
    """Return a MySQLdb DictCursor for the named connection, or None if unknown."""
    db = __getMySQLdb__(name)
    if db:
        return db.cursor(MySQLdb.cursors.DictCursor)
    log.error("Cannot find DB connection info - %s" % name)
    return None
def __checkMongo(self):
    """Log the APATLogQueue backlog size as a liveness probe."""
    try:
        mongoqueue = self.db['APATLogQueue'].count()
        log.debug("QUEUE - %d" % mongoqueue)
    except pymongo.errors.AutoReconnect:
        # transient - pymongo reconnects on the next operation
        pass
    except Exception as e:
        log.error(e)
def getNodes(self):
    """Return {worker-name: info-dict} for every child of <rootpath>/workers.

    The original built the mapping but never returned it; fixed to return the
    collected info (an empty or partial dict when a ZooKeeper call fails).
    """
    workerInfo = {}
    try:
        for worker in self.client.get_children(self.rootpath + '/workers'):
            info = self.client.get('%s/workers/%s' % (self.rootpath, worker))
            workerInfo[worker] = json.loads(info[0])
    except Exception as e:
        log.error("Worker Load error - worker : %s" % e)
    return workerInfo
def getNodeInfo(self, node=None):
    """Return the parsed JSON stored at the party znode for *node*.

    :param node: worker node name; defaults to this worker's own node
    :return: dict on success, None on any error (missing znode, bad JSON, ...)
    """
    if not node:
        node = self.node
    try:
        return json.loads(self.client.get('%s/%s' % (self.party.path, node))[0])
    except Exception as e:
        log.error(e)
        return None
def __reconnect(self):
    """Retry metaClient.restart() after a short delay, re-scheduling on failure."""
    gevent.sleep(1)
    try:
        self.metaClient.restart()
    except (kazoo.exceptions.SessionExpiredError,
            kazoo.exceptions.ConnectionLoss,
            kazoo.exceptions.ConnectionClosedError):
        # still disconnected - try again later
        gevent.spawn(self.__reconnect)
    except Exception as e:
        log.error(e)
        gevent.spawn(self.__reconnect)
def updateSubscribeEvents(self, info):
    """Rebuild self.exchangeInfo from {event: hashKey} pairs.

    Every event is routed through the 'job.general' exchange with the event
    name as its routing key.
    """
    try:
        self.exchangeInfo = dict(
            (event, dict(exchange="job.general", routing=event, hashKey=hashKey))
            for (event, hashKey) in info.items())
    except Exception as e:
        log.error(e)
def start(self):
    """Start the worker: open the meta client and, outside production, load dev tasks."""
    log.info("Start Worker [%s] - %s" % (id(self), time.time()))
    self.bLoop = True
    self.metaClient.open()
    if self.environment != 'production':
        try:
            self.__loadDevTask()
        except Exception as e:
            log.error(e)
def _run(self):
    """Maintenance thread entry: connect to ZooKeeper and verify meta info."""
    log.info("start maintenance thread..")
    # connecting zookeeper
    try:
        self.metaClient.start()
        self.__checkMetaInfo()
    except Exception as e:
        log.error(e)
        # bare raise preserves the original traceback ('raise e' truncated it)
        raise
def __eventWorker(self, data, event=None):
    """ZooKeeper watch callback for worker-list changes; refreshes worker status."""
    try:
        if isinstance(data, (list, tuple)):
            log.info("NODES : %d" % len(data))
        self.__updateWorkerStatus()
    except (kazoo.exceptions.SessionExpiredError,
            kazoo.exceptions.ConnectionLoss,
            kazoo.exceptions.ConnectionClosedError):
        # session gone - hand off to the reconnect loop
        gevent.spawn(self.__reconnect)
    except Exception as e:
        log.error(e)
def AMQPConnectionFactory(url):
    """Return a per-process cached, connected AMQP Connection for *url*.

    The connection is created and connected on first use, then reused.
    """
    global _process_connections
    if url not in _process_connections:
        # NOTE(review): logged at error level apparently for visibility
        log.error("Create Connection - %s" % url)
        _process_connections[url] = Connection(url)
        _process_connections[url].connect()
    return _process_connections[url]
def __connectionState(self, state):
    """Kazoo connection-state listener: re-registers the worker node on reconnect."""
    log.error("Zookeeper Connection -> %s - %s" % (state, self.node))
    if state == KazooState.CONNECTED:
        # if connected, setup worker node
        gevent.spawn(self.__setupNode)
    # SUSPENDED / LOST currently require no action
    self.state = state
def command(self, cmd):
    """Apply a feeder control command ('pause' / 'resume'); empty input is ignored."""
    if cmd is None or len(cmd) == 0:
        return
    flag = {"pause": True, "resume": False}.get(cmd)
    if flag is None:
        log.error("UNKNOWN COMMAND : %s" % cmd)
        return
    self.pause = flag
    log.info("%s MessageFeeder" % ("Pause" if flag else "Resume"))
def _run(self):
    """Task-slot thread body: start the producer (non-blocking) then consume until interrupted."""
    # current_thread(): currentThread() is a deprecated camelCase alias
    self.threadId = threading.current_thread().getName()
    log.info("Start task slot - %s (%s)" % (self.taskName, self.threadId))
    self.running = True
    if self.amqpProducer:
        # Non-blocking
        self.amqpProducer.start()
    if self.amqpConsumer:
        try:
            # blocking and consume messages
            self.amqpConsumer.start()
        except KeyboardInterrupt:
            log.error("CATCH KEYBOARD INTERRUPT in SLOT LOOP!!!!!")
    log.info("Exit task slot - %s (%s)" % (self.taskName, self.threadId))
def onProducerCallback(self):
    """Drain the publish queue, publishing each (event, params) to 'job.general'.

    Returns when the queue is momentarily empty (after a short back-off) or
    on the first publish error.
    """
    while True:
        try:
            event, params = self.publishQueue.get(block=False)
        except Queue.Empty:
            # nothing to publish right now - back off briefly and yield
            sleep(0.1)
            return
        self.publishQueue.task_done()
        # non consistent-hashing only.
        try:
            self.amqpProducer.publish('job.general', event, params)
        except Exception:
            log.error("Publish Error - event: %s, params: %s" % (event, params))
            return
def get_consumers(self, Consumer, channel):
    """Declare/bind the configured exchanges and collect their queues.

    NOTE(review): kombu's ConsumerMixin expects get_consumers() to return a
    list of Consumer objects, but this method returns None and consumerQueues
    is never consumed - confirm whether a
    'return [Consumer(queues=consumerQueues, ...)]' went missing.
    """
    log.info('get_consumers')
    consumerQueues = []
    # if ex == 'consistent-hashing' : bind 'job.general' and ex with route key
    # if ex != 'consistent-hashing' : bind ex and q with route key
    try:
        for exchange in self.exchanges:
            ex = exchange['exchange'](channel)
            ex.declare()
            if exchange['exchange'].type == 'x-consistent-hash':
                ex.bind_to('job.general', exchange['key'])
            # TODO: update!
            # setup queue
            q = exchange['queue']
            consumerQueues.append(q)
    except Exception as e:
        log.error(e)
def publish(self, exchange, key, data, arguments=None):
    """Publish *data* to *exchange* with routing key *key* (bzip2-compressed, persistent).

    :param arguments: optional message headers
    :return: True on success, False when disconnected or on publish error
             (the original fell off the end returning None on error)
    """
    if not self.connection.connected:
        return False
    try:
        with producers[self.connection].acquire(block=True) as producer:
            ex = Exchange(exchange, type="direct")
            # NOTE(review): ensure() returns a retry-wrapped callable that is
            # never used here; the plain producer.publish below is what runs
            self.connection.ensure(producer, producer.publish,
                                   errback=self.onError, max_retries=10, interval_max=3)
            producer.publish(
                data,
                routing_key=key,
                delivery_mode=2,  # persistent
                compression="bzip2",
                headers=arguments,
                exchange=ex,
                retry=True,
            )
            return True
    except Exception as e:
        log.error(e)
        return False
def __getMongodb__(name):
    """Return a cached pymongo MongoClient for the named connection.

    The original cached the client but never returned it, so callers always
    got None; fixed to return the connection (None on connect failure).

    :param name: connection name declared in __dbConfigParams
    :return: MongoClient or None
    """
    def connect(name):
        # 'name in dict' replaces the removed-in-Py3 dict.has_key()
        global __dbConfigParams
        if name not in __dbConfigParams:
            raise ValueError("invalid db connection name - %s" % name)
        url = __dbConfigParams[name]
        client = pymongo.MongoClient(url)
        return client
    conn = getattr(__dbConnections, name, None)
    if not conn:
        try:
            conn = connect(name)
        except Exception as e:
            log.error("MongoDB Connect Error - %s" % e)
            return None
        setattr(__dbConnections, name, conn)
    return conn
def __eventTask(self, data, event):
    """ZooKeeper watch callback: reload the task DB and push subscribe updates to feeders."""
    try:
        (nodes, stat) = self.metaClient.get_children(self.rootpath + '/oceanworker/tasks',
                                                     include_data=True)
        self.taskDB.version = stat.version
        self.taskDB.cversion = stat.cversion
        self.taskDB.tasks = {}
        for node in nodes:
            # renamed from (data, stat) - the original rebound its own parameters
            (nodeData, nodeStat) = self.metaClient.get(self.rootpath + '/oceanworker/tasks/%s' % node)
            znode = ZKNode(nodeStat.version, nodeStat.cversion)
            znode.data = nodeData
            # NOTE(review): .get() implies ZKNode parses the raw payload into a dict - confirm
            if znode.data.get('maxWorker', 0) > 0:
                self.taskDB.members[node] = znode
        for feeder in self.feeders:
            feeder.updateSubscribeEvents(self.getEventSubscribeStatus())
    except (kazoo.exceptions.SessionExpiredError,
            kazoo.exceptions.ConnectionLoss,
            kazoo.exceptions.ConnectionClosedError):
        gevent.spawn(self._reconnect)
    except Exception as e:
        log.error(e)
def onConsumerCallback(self, params):
    """Run the task handler on *params*, accumulating throughput stats per 30s window.

    :param params: task parameters passed straight to self.task.handler
    :return: False when not running or on handler failure; None otherwise
    """
    if not self.running:
        return False
    if self.reloading:
        # hot-swap in the updated task implementation
        log.info("Replace task to updated.")
        self.task = self.updatedTask
        self.updatedTask = None
        self.reloading = False
    if params is None or len(params) == 0:
        log.error("No task params for '%s' (%s)" % (self.taskName, self.threadId))
    try:
        start = time.time()
        self.task.handler(params)
        runTime = time.time() - start
        self.statSumProcessCount += 1
        # accumulate: the original assigned '=', which broke the average below
        self.statSumProcessTime += runTime
        if self.statNextPeriod <= time.time():
            log.debug("%s - %d messages. Avg. %f sec" %
                      (self.taskName, self.statSumProcessCount,
                       1.0 * self.statSumProcessTime / self.statSumProcessCount))
            self.statSumProcessCount = 0
            self.statSumProcessTime = 0
            self.statNextPeriod = time.time() + 30
    except Exception as e:
        log.error("Task Exception - %s (%s) - %s" % (self.taskName, e, params))
        worker.sentry.client.captureException(data=params)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        log.error(traceback.format_exception(exc_type, exc_value, exc_traceback))
        return False
def get_consumers(self, Consumer, channel):
    """Declare/bind the configured exchanges and collect their queues.

    NOTE(review): kombu's ConsumerMixin expects get_consumers() to return a
    list of Consumer objects, but this method returns None and consumerQueues
    is never consumed - confirm whether a
    'return [Consumer(queues=consumerQueues, ...)]' went missing.
    """
    log.info("get_consumers")
    consumerQueues = []
    # if ex == 'consistent-hashing' : bind 'job.general' and ex with route key
    # if ex != 'consistent-hashing' : bind ex and q with route key
    try:
        for exchange in self.exchanges:
            ex = exchange["exchange"](channel)
            ex.declare()
            if exchange["exchange"].type == "x-consistent-hash":
                ex.bind_to("job.general", exchange["key"])
            # TODO: update!
            # setup queue
            q = exchange["queue"]
            consumerQueues.append(q)
    except Exception as e:
        log.error(e)
def __reconnect(self):
    """Re-create this worker's znodes after a connection loss, retrying on failure."""
    gevent.sleep(1)
    try:
        # NOTE(review): this path uses '/worker/' while nodes are registered
        # under '/workers' elsewhere - confirm the exists() check hits the
        # intended znode
        if not self.client.exists(self.rootpath + '/worker/' + self.node):
            log.error("Reconnect - %s" % self.node)
            if len(self.devTasks) > 0:
                # re-register ephemeral dev-task nodes lost with the old session
                for path, info in self.devTasks.items():
                    self.client.create(path, json.dumps(info), ephemeral=True)
            self.__setupNode()
    except (kazoo.exceptions.SessionExpiredError,
            kazoo.exceptions.ConnectionLoss,
            kazoo.exceptions.ConnectionClosedError) as detail:
        log.error("Zookeeper : Unrecoverable Error - %s" % detail)
        # unrecoverable Error
        start = time.time()
        gevent.spawn(self.retryObj, self.client.restart)
        log.error("RECONNECT TIME - %s" % (time.time() - start))
    except Exception as e:
        # Try again
        log.error(e)
        exc_info = sys.exc_info()
        log.error(''.join(traceback.format_exception(*exc_info)))
        gevent.spawn(self.__reconnect)
def __updateWorkerStatus(self):
    """Reload the worker DB from ZooKeeper when its version changed; notify feeders."""
    try:
        (nodes, stat) = self.metaClient.get_children(self.rootpath + '/oceanworker/workers',
                                                     include_data=True)
        if self.workerDB.version == stat.version and self.workerDB.cversion == stat.cversion:
            return  # nothing changed
        self.workerDB.version = stat.version
        self.workerDB.cversion = stat.cversion
        self.workerDB.members = {}
        for node in nodes:
            try:
                (nodeData, nodeStat) = self.metaClient.get(self.rootpath + '/oceanworker/workers/%s' % node)
                znode = ZKNode(nodeStat.version, nodeStat.cversion)
                znode.data = nodeData
                self.workerDB.members[node] = znode
            except Exception as e:
                # a node may disappear between list and get - skip it
                log.error(e)
                continue
        for feeder in self.feeders:
            feeder.updateWorkerStatus(self.getTaskAssignStatus())
    except Exception as e:
        # NOTE(review): the outer try had no matching except in the source as
        # seen (syntactically invalid); restored a generic handler consistent
        # with sibling methods - confirm against the original file
        log.error(e)
def __setupNode(self):
    """Join the party, create this worker's event znode and read the last event timestamp."""
    try:
        if self.client.exists(os.path.join(self.party.path, self.party.node)):
            return  # already registered
        self.party.data = json.dumps(self.nodeInfo)
        self.party.join()
        # create event node for worker
        nodepath = self.rootpath + '/event/worker/' + self.node
        log.error("Create worker node - %s" % nodepath)
        if not self.client.exists(nodepath):
            self.client.create(nodepath, makepath=True, ephemeral=True)
        # collect info
        try:
            eventInfo = json.loads(self.client.get(self.rootpath + '/event')[0])
            if eventInfo:
                self.lastEventUpdated = eventInfo.get('update', time.time())
        except ValueError:
            # event znode holds no JSON yet - ignore
            pass
    except Exception as e:
        log.error(e)