def probe(self):
    """Send out heartbeat, node states; do the regular maintenance work.

    Runs forever, one iteration per second:

    * every iteration: broadcast a ``'live'`` message carrying this agent's
      id and ``self.stat.get_metric()``, print the summary, and create /
      refresh / drop ``self.jmgmt`` depending on whether this agent is the
      one returned by ``self.who_is_boss()``;
    * when the elapsed whole seconds are a multiple of 10: call
      ``start_possible_agents()`` if fewer than two agents are known, and
      drop every agent whose timestamp is more than 60 seconds old (the
      agent with the highest id then calls ``start_agent`` on that ip);
    * when the elapsed whole seconds are a multiple of 3600: the agent with
      the highest id calls ``start_possible_agents()``.

    Any ``Exception`` raised inside an iteration is printed and the loop
    carries on; the function never returns.

    NOTE(review): the original source had lost its indentation. The hourly
    check, ``print_summary()`` and the job-management section are placed at
    loop level (every tick) here -- confirm against the upstream file.
    """
    t0 = time.time()
    while True:
        try:
            t1 = time.time()
            # Clamp to 1 so the modulo checks below do not fire at start-up.
            t = max(int(t1 - t0), 1)
            msg = {'op':'live', 'agid':self.id, 'args':self.stat.get_metric()}
            self.broadcast(msg)
            if t % 10 == 0:
                if len(self.agents) < 2:
                    self.start_possible_agents()
                # Iterate over a snapshot: entries are removed inside the loop.
                for agid, agtp in list(self.agents.items()):
                    agip, agtm, _ = agtp
                    if t1 - agtm > 60:
                        self.agents.pop(agid)
                        # max() of an empty dict raises, so check for
                        # remaining agents before electing the restarter.
                        if self.agents and self.id == max(self.agents):
                            self.start_agent(agip)
            if t % 3600 == 0 and self.agents and self.id == max(self.agents):
                self.start_possible_agents()
            self.print_summary()
            # Liang: new feature test
            if self.id == self.who_is_boss():
                if self.jmgmt is None:
                    self.jmgmt = JobMgmt()
                else:
                    self.jmgmt.update_stat()
            else:
                self.jmgmt = None
        except Exception as err:
            print("Exception:Agent.probe(): %s" % (err,))
        # Sleep outside the try block so that a failing iteration is still
        # throttled instead of spinning and flooding the network.
        time.sleep(1)
class Agent(threading.Thread): def __init__(self): threading.Thread.__init__(self) self.agents = {} self.id = random.randint(0, 65535) self.ip = get_myip() self.stat = NodeStat() self.exit_event = threading.Event() self.jobs = {} self.jobs_lock = threading.Lock() self.jmgmt = None t = threading.Thread(target=self.probe, args=()) t.daemon = True t.start() pass def broadcast(self, msg): msg['tid'] = random.randint(0, 65535) msg = dump_msg(msg) bsock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) bsock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1) bsock.sendto(msg, ("<broadcast>", BPORT)) pass def get_app_path(self): app_path = os.path.realpath(__file__) return app_path def get_config_dir(self): config_dir = "%s/config" % os.path.dirname(self.get_app_path()) return config_dir def get_idle_nodes_origin(self): l = [] for k, v in self.agents.items(): agip, agts, agcpu = v bisect.insort_left(l,(agcpu,agip)) l.sort() return l def get_idle_nodes(self): """Liang: Only used in debug version, replace with original one in product version.""" if not hasattr(self, 'agent_in_use'): self.agent_in_use = set() l = [] for k, v in self.agents.items(): agip, agts, agcpu = v if agip in self.agent_in_use: continue else: bisect.insort_left(l,(agcpu,agip)) l.sort() return l def get_jobcontrol(self, jobid): """Get job control object from self.jobs""" jc = None self.jobs_lock.acquire() if jobid not in self.jobs.keys(): self.jobs[jobid] = JobControl(jobid) jc = self.jobs[jobid] self.jobs_lock.release() return jc def has_jobcontrol(self, jobid): """Test wheter the node agent has jobcontrol object given the jobid.""" has = False self.jobs_lock.acquire() if jobid in self.jobs.keys(): has = True self.jobs_lock.release() return has def print_summary(self): """Print out the summary information on the screen""" if self.agents: boss = self.who_is_boss() idlest = self.get_idle_nodes()[0] print("active:%i,\tboss:(%s, %i),\tidlest:(%s, %.2f)" % (len(self.agents), self.agents[boss][0], boss, 
idlest[1], idlest[0])) pass def probe(self): """Send out heartbeat, node states; do the regular maintenance work.""" t0 = time.time() while True: try: t1 = time.time() t = max(int(t1 - t0), 1) msg = {'op':'live', 'agid':self.id, 'args':self.stat.get_metric()} self.broadcast(msg) if t % 10 == 0: if len(self.agents) < 2: self.start_possible_agents() for agid, agtp in self.agents.items(): agip, agtm, _ = agtp if t1 - agtm > 60: self.agents.pop(agid) if self.id == max(self.agents.keys()): self.start_agent(agip) if t % 3600 == 0 and self.id == max(self.agents.keys()): self.start_possible_agents() self.print_summary() # Liang: new feature test if self.id == self.who_is_boss(): if self.jmgmt is None: self.jmgmt = JobMgmt() else: self.jmgmt.update_stat() else: self.jmgmt = None time.sleep(1) except Exception, err: print "Exception:Agent.probe():", err pass