def run(root_path):
    """Start the scheduler and block until the Mesos driver stops.

    Steps, in order:

    1. Fetch ``state.json`` from the Mesos master and refuse to run against
       a master older than 0.22.1 (an unparseable version is ignored).
    2. Build the ``FrameworkInfo`` (and a ``Credential`` when
       ``MESOS_AUTHENTICATE`` is set and ``DEFAULT_PRINCIPAL`` is present in
       the environment) and create the scheduler driver.
    3. Register ``ClusterMonitor`` and ``WebServer`` with the thread manager,
       install SIGINT/SIGTERM handlers and start everything.
    4. Run the driver in a background thread and exit the process with
       status 0 if it ended in ``DRIVER_STOPPED``, 1 otherwise.

    Args:
        root_path: Passed through unchanged to ``WebServer`` as its
            ``root_path`` option.

    Raises:
        SystemExit: On version-check failure (status 1) and whenever the
            driver thread finishes (status 0 or 1).
    """
    thread_manager, data_root, pool = setup()

    # Assert that we're running on Mesos 0.22.1 or above.
    version_url = "http://%s/state.json" % MESOS_MASTER_URL
    try:
        data = web_helpers.get_json_from_url(version_url)
        mesos_version = data["version"]
    except (web_helpers.GetJSONFromURLException, KeyError) as e:
        logger.error(
            "Error while trying to find version information for Mesos "
            "master %s: %s" % (MESOS_MASTER_URL, str(e)))
        sys.exit(1)

    try:
        mesos_version_tuple = tuple(int(x) for x in mesos_version.split('.'))
    except ValueError:
        # If we can't parse the version tuple, we ignore it.
        mesos_version_tuple = None

    if mesos_version_tuple is not None and mesos_version_tuple < (0, 22, 1):
        logger.error(
            "Mesos master at %s is version %s, but we require at least "
            "Mesos version 0.22.1" % (MESOS_MASTER_URL, mesos_version))
        sys.exit(1)

    framework = mesos_pb2.FrameworkInfo()
    framework.user = SCHEDULER_USER
    framework.name = SCHEDULER_NAME
    framework.role = SCHEDULER_ROLE
    framework.failover_timeout = FAILOVER_TIMEOUT
    framework.checkpoint = True
    framework.webui_url = "http://%s:%d" % (EXTERNAL_WEB_HOST, EXTERNAL_WEB_PORT)
    # Reuse a previously stored framework id when one exists.
    if data_root.meta.data.framework_id is not None:
        framework.id.value = data_root.meta.data.framework_id

    credential = None
    principal = os.environ.get("DEFAULT_PRINCIPAL", None)
    if MESOS_AUTHENTICATE and principal:
        secret = os.environ.get("DEFAULT_SECRET", None)
        framework.principal = principal
        credential = mesos_pb2.Credential()
        credential.principal = principal
        if secret is not None:
            credential.secret = secret

    # Any non-empty MESOS_EXPLICIT_ACKNOWLEDGEMENTS turns implicit
    # acknowledgements off.
    implicit_acknowledgments = 1
    if os.getenv("MESOS_EXPLICIT_ACKNOWLEDGEMENTS"):
        implicit_acknowledgments = 0

    scheduler = MemSQLScheduler(data_root)
    # The credential is only passed when authentication is configured.
    driver_args = [scheduler, framework, MESOS_MASTER_URL,
                   implicit_acknowledgments]
    if credential is not None:
        driver_args.append(credential)
    driver = MesosSchedulerDriver(*driver_args)

    thread_manager.add(ClusterMonitor, {
        "driver": driver,
        "data_root": data_root
    })
    thread_manager.add(WebServer, {
        "host": "0.0.0.0",
        "port": EXTERNAL_WEB_PORT,
        "root_path": root_path,
        "pool": pool
    })

    def shutdown(*args):
        """Signal handler / cleanup hook: stop helper threads, then the driver."""
        global SHUTTING_DOWN
        SHUTTING_DOWN = True

        logger.info("MemSQL scheduler is shutting down")
        thread_manager.close(block=True)
        scheduler.shuttingDown = True

        def stop_timeout():
            # Wait for running tasks to finish, but for at most
            # SHUTDOWN_TIMEOUT seconds, before stopping the driver.
            wait_started = datetime.datetime.now()
            while scheduler.tasksRunning > 0 and SHUTDOWN_TIMEOUT > (datetime.datetime.now() - wait_started).total_seconds():
                time.sleep(1)

            if scheduler.tasksRunning > 0:
                logger.warning("Shutdown by timeout, %d task(s) have not completed" % scheduler.tasksRunning)

            driver.stop(True)

        stop_thread = Thread(target=stop_timeout)
        stop_thread.start()

    signal.signal(signal.SIGINT, shutdown)
    signal.signal(signal.SIGTERM, shutdown)

    try:
        thread_manager.start(timeout=START_TIMEOUT)
    except Exception:
        logger.error("Failed to start MemSQL Agent.", exc_info=True)
    else:
        logger.info("Exit with Ctrl-C")

        result = []

        # driver.run() blocks; we run it in a separate thread
        def run_driver_async():
            status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
            driver.stop(True)
            result.append(status)

        framework_thread = Thread(target=run_driver_async, args=())
        framework_thread.start()

        # Poll rather than join(); presumably this keeps the main thread
        # responsive to the signal handlers installed above.
        while framework_thread.is_alive():
            time.sleep(1)

        # If the driver thread died before recording a status (e.g.
        # driver.run() raised), report failure instead of crashing with an
        # IndexError on the empty list.
        sys.exit(result[0] if result else 1)
    finally:
        if not SHUTTING_DOWN:
            shutdown()
def resourceOffers(self, driver, offers):
    """Use incoming resource offers to launch agent tasks for a new cluster.

    Presumably this is the Mesos ``Scheduler.resourceOffers`` callback.
    For every offer, at most one agent task is launched, for the first node
    still in ``CREATING`` state of the first cluster still in ``CREATING``
    state. Every offer that is not used to launch a task is declined,
    including offers abandoned because the agent version lookup failed.

    Args:
        driver: Scheduler driver used to launch tasks and decline offers.
        offers: Iterable of offers; each must expose ``id``, ``hostname``
            and ``resources``.
    """
    logger.debug("Got %d offers" % len(offers))
    clusters = self.root.clusters
    logger.debug("Current clusters: %s" % clusters.records)

    for offer in offers:
        launched = False
        logger.debug("Got resource offer [%s]" % offer.id.value)

        if self.shuttingDown:
            logger.debug("Shutting down: declining offer on [%s]" % offer.hostname)
            driver.declineOffer(offer.id)
            continue

        # Re-evaluated for every offer because handling an earlier offer
        # may have moved a cluster out of the CREATING state.
        cluster = next(
            (c for c in clusters
             if c.data.status == const.ClusterStatus.CREATING),
            None)
        if cluster is None:
            logger.debug("Nothing to do, declining offer [%s]" % offer.id.value)
            driver.declineOffer(offer.id)
            continue

        cpus, mem, disk, ports = utils.get_resources(offer.resources, MEMSQL_SCHEDULER_ROLE)

        try:
            data = web_helpers.get_json_from_url(CURRENT_AGENT_VERSION_URL)
            agent_version = data["version"]
        except (web_helpers.GetJSONFromURLException, KeyError) as e:
            logger.error(
                "Error while getting newest version of MemSQL Agent: %s. "
                "Rolling back cluster" % str(e))
            self._rollback_cluster(cluster)
            # The offer is not going to be used; hand it back so the
            # resources are not held by this framework.
            driver.declineOffer(offer.id)
            continue

        cluster.save(agent_version=agent_version)
        cluster.maybe_create_nodes()

        flavor = cluster.flavor
        if flavor.bigger_than(cpus, mem / 1024, disk / 1024) or len(ports) < AGENT_TASK_PORTS:
            logger.info("Not enough resources for agent task: "
                        "%s cpus, %s mem, %s disk, %s ports but "
                        "need %s, %s ports" % (
                            cpus, mem / 1024, disk / 1024, len(ports),
                            flavor, AGENT_TASK_PORTS))
            driver.declineOffer(offer.id)
            continue

        primary_node = cluster.nodes.find(agent_role=const.AgentRole.PRIMARY)
        # Stable sort that puts the primary node (if any) first.
        nodes = sorted(cluster.nodes, key=lambda n: n is not primary_node)

        for i, node in enumerate(nodes):
            # With no explicit primary, the first node acts as one.
            if primary_node is None:
                primary_node = node

            if node.data.status != const.ClusterStatus.CREATING:
                # Reached the last node without finding one left to
                # launch: the cluster now only waits for its agents.
                if i == len(nodes) - 1:
                    cluster.save(status=const.ClusterStatus.WAITING_FOR_AGENTS)
                continue

            logger.info("Accepting offer for agent on [%s] for node %s" % (offer.hostname, node.data.memsql_role))

            extras = {}
            if node.data.agent_role == const.AgentRole.FOLLOWER:
                extras['primary_host'] = primary_node.data.host
                extras['primary_memsql_port'] = primary_node.data.memsql_port

            host = utils.generate_host()
            # Resolve once so the launched task and the stored node record
            # are guaranteed to carry the same address.
            host_ip = socket.gethostbyname(offer.hostname)
            agent_port, memsql_port, demo_port = ports[0], ports[1], ports[2]

            task = self.make_agent_task(
                offer=offer,
                cpu=flavor.cpu,
                mem=flavor.memory_mb,
                disk=flavor.disk_mb,
                host_ip=host_ip,
                agent_host=host,
                agent_port=agent_port,
                memsql_port=memsql_port,
                demo_port=demo_port,
                memsql_role=node.data.memsql_role,
                cluster_name=cluster.name,
                display_name=cluster.data.display_name,
                install_demo=cluster.data.install_demo,
                agent_version=cluster.data.agent_version,
                **extras)

            launched = True
            driver.launchTasks(offer.id, [task])

            node.save(
                task_id=task.task_id.value,
                status=const.ClusterStatus.WAITING_FOR_AGENTS,
                host=host,
                host_ip=host_ip,
                agent_port=agent_port,
                memsql_port=memsql_port,
                demo_port=demo_port)

            # save this data in the cluster
            if node.data.agent_role == const.AgentRole.PRIMARY:
                cluster.save(
                    primary_host=offer.hostname,
                    primary_agent_port=agent_port,
                    primary_memsql_port=memsql_port,
                    primary_demo_port=demo_port
                )

            logger.info("Running agent on (%s) %s:%d, %d, %d" % (host, offer.hostname, agent_port, memsql_port, demo_port))
            # One task per offer.
            break

        if not launched:
            logger.debug("Not doing anything anymore, declining offer [%s]" % offer.id.value)
            driver.declineOffer(offer.id)