def __init__(self):
    LoggerMixin.configure()
    db.configure()
    Compute.configure()
    APIContainer.configure()
    JBoxAsyncJob.configure()
    JBoxAsyncJob.init(JBoxAsyncJob.MODE_PUB)

    self.application = tornado.web.Application(handlers=[
        (r"^/", APIInfoHandler),
        (r"^/.*/.*", APIHandler)
    ])
    self.application.settings["cookie_secret"] = JBoxCfg.get('sesskey')
    self.application.listen(JBoxCfg.get('api.manager_port'), address=socket.gethostname())
    self.application.listen(JBoxCfg.get('api.manager_port'), address='localhost')
    self.ioloop = ioloop.IOLoop.instance()

    # run container maintenance every 5 minutes
    run_interval = 5 * 60 * 1000
    self.log_info("Container maintenance every " + str(run_interval / (60 * 1000)) + " minutes")
    self.ct = ioloop.PeriodicCallback(JBoxAPI.do_housekeeping, run_interval, self.ioloop)
    self.sigct = ioloop.PeriodicCallback(JBoxAPI.do_signals, 1000, self.ioloop)
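# A minimal, self-contained sketch (not JuliaBox code) of the Tornado pattern used
# above: an Application with regex-routed handlers, listening on a port, plus a
# PeriodicCallback driving housekeeping on the IOLoop. The handler name, port and
# callback are hypothetical.
import tornado.ioloop
import tornado.web


class PingHandlerDemo(tornado.web.RequestHandler):
    def get(self):
        self.write("pong")


def housekeeping_demo():
    # placeholder for periodic maintenance work
    print("housekeeping tick")


if __name__ == "__main__":
    app = tornado.web.Application(handlers=[(r"^/ping$", PingHandlerDemo)])
    app.listen(8888, address='localhost')
    loop = tornado.ioloop.IOLoop.instance()
    # interval is expressed in milliseconds, as in the code above
    tornado.ioloop.PeriodicCallback(housekeeping_demo, 5 * 60 * 1000).start()
    loop.start()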
def update_juliabox_status():
    in_error = len(JBoxInstanceProps.get_stale_instances(Compute.get_install_id()))
    instance_status = JBoxInstanceProps.get_instance_status(Compute.get_install_id())

    HTML = "<html><body><center><pre>\nJuliaBox is Up.\n\nLast updated: " + \
           datetime.datetime.now().isoformat() + " UTC\n\nLoads: "
    for iid in instance_status:
        HTML += (str(instance_status[iid]['load']) + '% ')
    HTML += ("\n\nErrors: " + str(in_error) +
             "\n\nAWS Status: <a href='http://status.aws.amazon.com/'>status.aws.amazon.com</a></pre></center></body></html>")

    plugin = JBPluginCloud.jbox_get_plugin(JBPluginCloud.JBP_BUCKETSTORE)
    bkt = JBoxCfg.get("cloud_host.status_bucket")
    if plugin is not None and bkt is not None:
        try:
            f = open("/tmp/index.html", "w")
            f.write(HTML)
            f.close()
            plugin.push(bkt, "/tmp/index.html")
        finally:
            os.remove("/tmp/index.html")
    else:
        JBox.log_debug("Status: %s", HTML)
    return None
def sendrecv(self, cmd, data, dest=None, port=None):
    if (dest is None) or (dest == 'localhost'):
        dest = Compute.get_instance_local_ip()
    else:
        dest = Compute.get_instance_local_ip(dest)

    if port is None:
        port = self._rrport

    rraddr = 'tcp://%s:%d' % (dest, port)
    JBoxAsyncJob.log_debug("sendrecv to %s. connecting...", rraddr)
    sock = self._ctx.socket(zmq.REQ)
    sock.setsockopt(zmq.LINGER, 5*1000)
    sock.connect(rraddr)

    poller = zmq.Poller()
    poller.register(sock, zmq.POLLOUT)
    if poller.poll(10*1000):
        sock.send_json(self._make_msg(cmd, data))
    else:
        sock.close()
        raise IOError("could not connect to %s" % rraddr)

    poller.modify(sock, zmq.POLLIN)
    if poller.poll(10*1000):
        msg = sock.recv_json()
    else:
        sock.close()
        raise IOError("did not receive anything from %s" % rraddr)

    JBoxAsyncJob.log_debug("sendrecv to %s. received.", rraddr)
    sock.close()
    return msg
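# A standalone sketch (assumptions: pyzmq installed, a REP peer bound at the
# hypothetical address below) of the same request/reply pattern: a REQ socket with
# LINGER set, and a Poller bounding both the send and the receive with a timeout
# instead of blocking forever.
import zmq


def send_with_timeout(addr, payload, timeout_ms=10 * 1000):
    ctx = zmq.Context.instance()
    sock = ctx.socket(zmq.REQ)
    sock.setsockopt(zmq.LINGER, 5 * 1000)   # don't hang on close with unsent data
    sock.connect(addr)

    poller = zmq.Poller()
    poller.register(sock, zmq.POLLOUT)
    try:
        if not poller.poll(timeout_ms):
            raise IOError("could not connect to %s" % addr)
        sock.send_json(payload)

        poller.modify(sock, zmq.POLLIN)
        if not poller.poll(timeout_ms):
            raise IOError("no reply from %s" % addr)
        return sock.recv_json()
    finally:
        sock.close()


# usage (hypothetical endpoint and command):
# reply = send_with_timeout('tcp://127.0.0.1:5555', {'cmd': 'status', 'data': {}})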
def backup_and_cleanup(dockid):
    cont = SessContainer(dockid)
    cont.stop()
    cont.delete(backup=True)
    JBoxSessionProps.detach_instance(Compute.get_install_id(), cont.get_name(), Compute.get_instance_id())
    JBoxd.publish_perf_counters()
    JBoxd.publish_anticipated_load()
def __init__(self):
    LoggerMixin.configure()
    db.configure()
    Compute.configure()
    SessContainer.configure()
    VolMgr.configure()
    JBoxAsyncJob.configure()
    JBoxAsyncJob.init(JBoxAsyncJob.MODE_PUB)

    self.application = tornado.web.Application(handlers=[
        (r"/", MainHandler),
        (r"/jboxadmin/", AdminHandler),
        (r"/jboxping/", PingHandler),
        (r"/jboxcors/", CorsHandler)
    ])
    JBPluginHandler.add_plugin_handlers(self.application)
    JBPluginUI.create_include_files()

    # cookie_secret = ''.join(random.choice(string.ascii_uppercase + string.digits) for x in xrange(32))
    # use sesskey as cookie secret to be able to span multiple tornado servers
    self.application.settings["cookie_secret"] = JBoxCfg.get('sesskey')
    self.application.settings["plugin_features"] = JBox.get_pluggedin_features()
    self.application.listen(JBoxCfg.get('interactive.manager_port'), address=socket.gethostname())
    self.application.listen(JBoxCfg.get('interactive.manager_port'), address='localhost')
    self.ioloop = tornado.ioloop.IOLoop.instance()

    # run container maintenance every 5 minutes
    run_interval = 5 * 60 * 1000
    self.log_info("Container maintenance every " + str(run_interval / (60 * 1000)) + " minutes")
    self.ct = tornado.ioloop.PeriodicCallback(JBox.do_housekeeping, run_interval, self.ioloop)
    self.sigct = tornado.ioloop.PeriodicCallback(JBox.do_signals, 1000, self.ioloop)
def run(self):
    Compute.deregister_instance_dns()
    Compute.register_instance_dns()
    JBoxd.publish_perf_counters()

    JBoxd.log_debug("Setting up signal handlers")
    signal.signal(signal.SIGINT, JBoxd.signal_handler)
    signal.signal(signal.SIGTERM, JBoxd.signal_handler)

    if VolMgr.has_update_for_user_home_image():
        VolMgr.update_user_home_image(fetch=False)

    while True:
        self.log_debug("JBox daemon waiting for commands...")
        try:
            offline, reply_req = JBoxd.QUEUE.poll(self._is_scheduled(JBoxAsyncJob.CMD_REQ_RESP, ()))
        except ValueError:
            self.log_exception("Exception reading command. Will retry after 10 seconds")
            time.sleep(10)
            continue

        if offline:
            try:
                self.process_offline()
            except:
                self.log_exception("Exception scheduling request")

        if reply_req:
            JBoxd.schedule_thread(JBoxAsyncJob.CMD_REQ_RESP, JBoxd.process_and_respond, ())
def publish_sessions(): iid = Compute.get_instance_id() for c in SessContainer.session_containers(allcontainers=True): if ("Names" in c) and (c["Names"] is not None): sessname = SessContainer(c["Id"]).get_name() if sessname: JBoxSessionProps.attach_instance(Compute.get_install_id(), sessname, iid, c["Status"])
def get_api_status():
    api_status = dict()
    for c in BaseContainer.api_containers(allcontainers=True):
        name = c["Names"][0] if (("Names" in c) and (c["Names"] is not None)) else c["Id"][0:12]
        api_name = APIContainer.get_api_name_from_container_name(name)
        if api_name is None:
            continue
        cnt = api_status.get(api_name, 0)
        api_status[api_name] = cnt + 1

    self_load = Compute.get_instance_stats(Compute.get_instance_id(), 'Load')
    accept = Compute.should_accept_session(is_proposed_cluster_leader())

    return {'load': self_load, 'accept': accept, 'api_status': api_status}
def publish_instance_state():
    iid = Compute.get_instance_id()

    api_status = dict()
    for c in BaseContainer.api_containers(allcontainers=True):
        name = c["Names"][0] if (("Names" in c) and (c["Names"] is not None)) else c["Id"][0:12]
        api_name = APIContainer.get_api_name_from_container_name(name)
        if api_name is None:
            continue
        cnt = api_status.get(api_name, 0)
        api_status[api_name] = cnt + 1

    self_load = Compute.get_instance_stats(iid, 'Load')
    accept = Compute.should_accept_session(is_proposed_cluster_leader())

    JBoxInstanceProps.set_props(Compute.get_install_id(), iid, load=self_load, accept=accept, api_status=api_status)
def __init__(self, ports, mode):
    self._mode = mode
    self._ctx = zmq.Context()

    ppmode = zmq.PUSH if (mode == JBoxAsyncJob.MODE_PUB) else zmq.PULL
    self._push_pull_sock = self._ctx.socket(ppmode)

    rrmode = zmq.REQ if (mode == JBoxAsyncJob.MODE_PUB) else zmq.REP

    local_ip = Compute.get_instance_local_ip()
    JBoxAsyncJob.log_debug("local hostname [%s]", local_ip)

    ppbindaddr = 'tcp://%s:%d' % (local_ip, ports[0],)
    ppconnaddr = 'tcp://%s:%d' % (local_ip, ports[0],)
    rraddr = 'tcp://%s:%d' % (local_ip, ports[1],)
    self._rrport = ports[1]

    self._poller = zmq.Poller()

    if mode == JBoxAsyncJob.MODE_PUB:
        self._push_pull_sock.connect(ppconnaddr)
    else:
        self._push_pull_sock.bind(ppbindaddr)
        self._poller.register(self._push_pull_sock, zmq.POLLIN)
        self._req_rep_sock = self._ctx.socket(rrmode)
        self._req_rep_sock.bind(rraddr)
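# A standalone sketch (assumptions: pyzmq installed; the port and the reply format
# are hypothetical) of the subscriber/daemon side implied above: bind a REP socket,
# poll it, and answer each JSON request. It pairs with the REQ-side sendrecv
# pattern shown earlier.
import zmq


def rep_loop_demo(bind_addr='tcp://127.0.0.1:5555'):
    ctx = zmq.Context.instance()
    rep = ctx.socket(zmq.REP)
    rep.bind(bind_addr)

    poller = zmq.Poller()
    poller.register(rep, zmq.POLLIN)

    while True:
        # wait up to a second for a request, then loop (leaves room for shutdown checks)
        if poller.poll(1000):
            req = rep.recv_json()
            # echo the command back with a success code; a real handler would dispatch on req['cmd']
            rep.send_json({'code': 0, 'data': req.get('cmd')})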
def update_juliabox_status():
    instances = Compute.get_all_instances()
    in_error = 0

    HTML = "<html><body><center><pre>\nJuliaBox is Up.\n\nLast updated: " + \
           datetime.datetime.now().isoformat() + " UTC\n\nLoads: "
    for inst in instances:
        try:
            status = JBoxAsyncJob.sync_api_status(inst)['data']
            HTML += (str(status['load']) + '% ')
        except:
            in_error += 1
    HTML += ("\n\nErrors: " + str(in_error) +
             "\n\nAWS Status: <a href='http://status.aws.amazon.com/'>status.aws.amazon.com</a></pre></center></body></html>")

    plugin = JBPluginCloud.jbox_get_plugin(JBPluginCloud.JBP_BUCKETSTORE)
    bkt = JBoxCfg.get("cloud_host.status_bucket")
    if plugin is not None and bkt is not None:
        try:
            f = open("/tmp/index.html", "w")
            f.write(HTML)
            f.close()
            plugin.push(bkt, "/tmp/index.html")
        finally:
            os.remove("/tmp/index.html")
    else:
        JBox.log_debug("Status: %s", HTML)
    return None
def is_terminating():
    if not JBoxCfg.get('cloud_host.scale_down'):
        return False

    num_active = BaseContainer.num_active()
    terminate = (num_active == 0) and Compute.can_terminate(is_proposed_cluster_leader())

    if terminate:
        JBoxd.log_warn("terminating to scale down")
        try:
            Compute.deregister_instance_dns()
        except:
            JBoxd.log_error("Error deregistering instance dns")
        Compute.terminate_instance()

    return terminate
def do_housekeeping(): terminating = False server_delete_timeout = JBoxCfg.get('interactive.expire') inactive_timeout = JBoxCfg.get('interactive.inactivity_timeout') SessContainer.maintain(max_timeout=server_delete_timeout, inactive_timeout=inactive_timeout) is_leader = is_cluster_leader() if is_leader: terminating = False else: try: terminating = JBoxAsyncJob.sync_is_terminating() if terminating['code'] == 0: terminating = terminating['data'] else: JBox.log_error("Error checking if instance is terminating. Assuming False.") terminating = False except: JBox.log_error("Exception checking if instance is terminating. Assuming False.") terminating = False if is_leader: JBox.log_info("I am the cluster leader") JBox.update_juliabox_status() JBox.monitor_registrations() if not JBoxDynConfig.is_stat_collected_within(Compute.get_install_id(), 1): JBoxAsyncJob.async_collect_stats() if terminating: JBox.log_warn("terminating to scale down") else: JBox.do_update_user_home_image() JBoxAsyncJob.async_plugin_maintenance(is_leader)
def monitor_registrations():
    max_rate = JBoxDynConfig.get_registration_hourly_rate(Compute.get_install_id())
    rate = JBoxUserV2.count_created(1)
    reg_allowed = JBoxDynConfig.get_allow_registration(Compute.get_install_id())
    JBox.log_debug("registration allowed: %r, rate: %d, max allowed: %d", reg_allowed, rate, max_rate)

    if (reg_allowed and (rate > max_rate*1.1)) or ((not reg_allowed) and (rate < max_rate*0.9)):
        reg_allowed = not reg_allowed
        JBox.log_warn("Changing registration allowed to %r", reg_allowed)
        JBoxDynConfig.set_allow_registration(Compute.get_install_id(), reg_allowed)

    if reg_allowed:
        num_pending_activations = JBoxUserV2.count_pending_activations()
        if num_pending_activations > 0:
            JBox.log_info("scheduling activations for %d pending activations", num_pending_activations)
            JBoxAsyncJob.async_schedule_activations()
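# A minimal pure-Python sketch of the hysteresis used above: registration is
# disabled only after the hourly rate exceeds 110% of the limit, and re-enabled
# only after it falls below 90%, so the flag does not flap around the threshold.
# The function and its names are illustrative, not part of JuliaBox.
def next_registration_state(currently_allowed, hourly_rate, max_rate):
    if (currently_allowed and hourly_rate > max_rate * 1.1) or \
            ((not currently_allowed) and hourly_rate < max_rate * 0.9):
        return not currently_allowed
    return currently_allowed


# e.g. with max_rate=100: allowed stays True at rate 105, flips to False at 115,
# stays False at 95, and flips back to True at 85.
assert next_registration_state(True, 105, 100) is True
assert next_registration_state(True, 115, 100) is False
assert next_registration_state(False, 95, 100) is False
assert next_registration_state(False, 85, 100) is True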
def publish_perf_counters(): """ Publish performance counters. Used for status monitoring and auto scaling. """ VolMgr.refresh_disk_use_status() nactive = BaseContainer.num_active(BaseContainer.SFX_INT) stats = [] stats.append(("NumActiveContainers", "Count", nactive)) nactive_api = BaseContainer.num_active(BaseContainer.SFX_API) stats.append(("NumActiveAPIContainers", "Count", nactive_api)) curr_cpu_used_pct = psutil.cpu_percent() last_cpu_used_pct = curr_cpu_used_pct if BaseContainer.LAST_CPU_PCT is None else BaseContainer.LAST_CPU_PCT BaseContainer.LAST_CPU_PCT = curr_cpu_used_pct cpu_used_pct = int((curr_cpu_used_pct + last_cpu_used_pct) / 2) stats.append(("CPUUsed", "Percent", cpu_used_pct)) mem_used_pct = psutil.virtual_memory().percent stats.append(("MemUsed", "Percent", mem_used_pct)) disk_used_pct = 0 for x in psutil.disk_partitions(): if not VolMgr.is_mount_path(x.mountpoint): try: disk_used_pct = max(psutil.disk_usage(x.mountpoint).percent, disk_used_pct) except: pass if BaseContainer.INITIAL_DISK_USED_PCT is None: BaseContainer.INITIAL_DISK_USED_PCT = disk_used_pct disk_used_pct = max(0, (disk_used_pct - BaseContainer.INITIAL_DISK_USED_PCT)) stats.append(("DiskUsed", "Percent", disk_used_pct)) cont_load_pct = min(100, max(0, nactive * 100 / SessContainer.MAX_CONTAINERS)) stats.append(("ContainersUsed", "Percent", cont_load_pct)) api_cont_load_pct = min(100, max(0, nactive_api * 100 / APIContainer.MAX_CONTAINERS)) stats.append(("APIContainersUsed", "Percent", api_cont_load_pct)) stats.append(("DiskIdsUsed", "Percent", VolMgr.used_pct())) overall_load_pct = max( cont_load_pct, api_cont_load_pct, disk_used_pct, mem_used_pct, cpu_used_pct, VolMgr.used_pct() ) stats.append(("Load", "Percent", overall_load_pct)) Compute.publish_stats_multi(stats)
def __init__(self):
    LoggerMixin.configure()
    db.configure()
    Compute.configure()
    SessContainer.configure()
    APIContainer.configure()
    VolMgr.configure()
    JBoxAsyncJob.configure()
    JBoxAsyncJob.init(JBoxAsyncJob.MODE_SUB)

    self.log_debug("Container manager listening on ports: %s", repr(JBoxCfg.get('container_manager_ports')))
    JBoxd.QUEUE = JBoxAsyncJob.get()

    JBoxd.MAX_ACTIVATIONS_PER_SEC = JBoxCfg.get('user_activation.max_activations_per_sec')
    JBoxd.MAX_AUTO_ACTIVATIONS_PER_RUN = JBoxCfg.get('user_activation.max_activations_per_run')
    JBoxd.ACTIVATION_SUBJECT = JBoxCfg.get('user_activation.mail_subject')
    JBoxd.ACTIVATION_BODY = JBoxCfg.get('user_activation.mail_body')
    JBoxd.ACTIVATION_SENDER = JBoxCfg.get('user_activation.sender')
def publish_perf_counters(): """ Publish performance counters. Used for status monitoring and auto scaling. """ VolMgr.refresh_disk_use_status() nactive = BaseContainer.num_active(BaseContainer.SFX_INT) stats = [] stats.append(("NumActiveContainers", "Count", nactive)) nactive_api = BaseContainer.num_active(BaseContainer.SFX_API) stats.append(("NumActiveAPIContainers", "Count", nactive_api)) curr_cpu_used_pct = psutil.cpu_percent() last_cpu_used_pct = curr_cpu_used_pct if BaseContainer.LAST_CPU_PCT is None else BaseContainer.LAST_CPU_PCT BaseContainer.LAST_CPU_PCT = curr_cpu_used_pct cpu_used_pct = int((curr_cpu_used_pct + last_cpu_used_pct)/2) stats.append(("CPUUsed", "Percent", cpu_used_pct)) mem_used_pct = psutil.virtual_memory().percent stats.append(("MemUsed", "Percent", mem_used_pct)) disk_used_pct = 0 for x in psutil.disk_partitions(): if not VolMgr.is_mount_path(x.mountpoint): try: disk_used_pct = max(psutil.disk_usage(x.mountpoint).percent, disk_used_pct) except: pass if BaseContainer.INITIAL_DISK_USED_PCT is None: BaseContainer.INITIAL_DISK_USED_PCT = disk_used_pct disk_used_pct = max(0, (disk_used_pct - BaseContainer.INITIAL_DISK_USED_PCT)) stats.append(("DiskUsed", "Percent", disk_used_pct)) cont_load_pct = min(100, max(0, nactive * 100 / SessContainer.MAX_CONTAINERS)) stats.append(("ContainersUsed", "Percent", cont_load_pct)) api_cont_load_pct = min(100, max(0, nactive_api * 100 / APIContainer.MAX_CONTAINERS)) stats.append(("APIContainersUsed", "Percent", api_cont_load_pct)) stats.append(("DiskIdsUsed", "Percent", VolMgr.used_pct())) overall_load_pct = max(cont_load_pct, api_cont_load_pct, disk_used_pct, mem_used_pct, cpu_used_pct, VolMgr.used_pct()) stats.append(("Load", "Percent", overall_load_pct)) Compute.publish_stats_multi(stats)
def __init__(self):
    LoggerMixin.configure()
    db.configure()
    Compute.configure()
    SessContainer.configure()
    VolMgr.configure()
    JBoxAsyncJob.configure()
    JBoxAsyncJob.init(JBoxAsyncJob.MODE_PUB)

    self.application = tornado.web.Application(handlers=[
        (r"/", MainHandler),
        (r"/jboxadmin/", AdminHandler),
        (r"/jboxping/", PingHandler),
        (r"/jboxcors/", CorsHandler)
    ])
    JBPluginHandler.add_plugin_handlers(self.application)
    JBPluginUI.create_include_files()

    # cookie_secret = ''.join(random.choice(string.ascii_uppercase + string.digits) for x in xrange(32))
    # use sesskey as cookie secret to be able to span multiple tornado servers
    self.application.settings["cookie_secret"] = JBoxCfg.get('sesskey')
    self.application.settings["plugin_features"] = JBox.get_pluggedin_features()
    self.application.listen(JBoxCfg.get('interactive.manager_port'), address=socket.gethostname())
    self.application.listen(JBoxCfg.get('interactive.manager_port'), address='localhost')
    self.ioloop = tornado.ioloop.IOLoop.instance()

    # run container maintenance every 5 minutes
    run_interval = 5 * 60 * 1000
    self.log_info("Container maintenance every " + str(run_interval / (60 * 1000)) + " minutes")
    self.ct = tornado.ioloop.PeriodicCallback(JBox.do_housekeeping, run_interval, self.ioloop)
    self.sigct = tornado.ioloop.PeriodicCallback(JBox.do_signals, 1000, self.ioloop)

    # or configure cacerts
    AsyncHTTPClient.configure(None, defaults=dict(validate_cert=None))
def schedule_housekeeping(cmd, is_leader):
    JBoxd.publish_perf_counters()
    JBoxd.publish_sessions()
    JBoxd.publish_instance_state()

    features = [JBPluginTask.JBP_NODE]
    if is_leader is True:
        JBoxInstanceProps.purge_stale_instances(Compute.get_install_id())
        features.append(JBPluginTask.JBP_CLUSTER)

    for feature in features:
        for plugin in JBPluginTask.jbox_get_plugins(feature):
            JBoxd.schedule_thread(cmd, plugin.do_periodic_task, (feature,))
def publish_anticipated_load(session_name=None):
    iid = Compute.get_instance_id()

    if session_name is None:
        nactive = BaseContainer.num_active(BaseContainer.SFX_INT)
    else:
        JBoxSessionProps.attach_instance(Compute.get_install_id(), session_name, iid, "Preparing")
        nactive = BaseContainer.num_active(BaseContainer.SFX_INT) + 1

    cont_load_pct = min(100, max(0, nactive * 100 / SessContainer.MAX_CONTAINERS))
    self_load = max(Compute.get_instance_stats(iid, 'Load'), cont_load_pct)
    Compute.publish_stats("Load", "Percent", self_load)

    accept = Compute.should_accept_session(is_proposed_cluster_leader())
    JBoxInstanceProps.set_props(Compute.get_install_id(), iid, load=self_load, accept=accept)
def publish_container_stats():
    VolMgr.publish_stats()
    db.publish_stats()
    JBoxDynConfig.set_stat_collected_date(Compute.get_install_id())
def publish_sessions():
    iid = Compute.get_instance_id()
    for c in SessContainer.session_containers(allcontainers=True):
        if ('Names' in c) and (c['Names'] is not None):
            JBoxSessionProps.attach_instance(Compute.get_install_id(), SessContainer(c['Id']).get_name(), iid,
                                             c["Status"])