コード例 #1
0
ファイル: main.py プロジェクト: linearregression/memsql-mesos
def run(root_path):
    """Start the MemSQL Mesos scheduler and block until the driver stops.

    Steps, in order:
      1. Fetch ``state.json`` from the Mesos master and require version
         0.22.1 or newer (an unparseable version string is tolerated).
      2. Build the ``FrameworkInfo`` and, when authentication is enabled
         and a principal is configured, a ``Credential``.
      3. Register the ClusterMonitor and WebServer threads, install
         SIGINT/SIGTERM handlers, and start the thread manager.
      4. Run the scheduler driver in a background thread and poll until
         that thread finishes.

    Args:
        root_path: Forwarded to the WebServer thread as ``root_path``.

    Exits via ``sys.exit``:
        1 if the master version cannot be fetched or is too old;
        0 if the driver finished with ``DRIVER_STOPPED``;
        1 if the driver finished in any other state or the driver thread
        died before reporting a status.

    If the thread manager fails to start, the error is logged, shutdown
    is triggered and the function returns normally.
    """
    thread_manager, data_root, pool = setup()

    # Assert that we're running on Mesos 0.22.1 or above.
    try:
        version_url = "http://%s/state.json" % MESOS_MASTER_URL
        data = web_helpers.get_json_from_url(version_url)
        mesos_version = data["version"]
    except (web_helpers.GetJSONFromURLException, KeyError) as e:
        logger.error(
            "Error while trying to find version information for Mesos "
            "master %s: %s" % (MESOS_MASTER_URL, str(e)))
        sys.exit(1)

    try:
        mesos_version_tuple = tuple(int(x) for x in mesos_version.split('.'))
    except ValueError:
        # If we can't parse the version tuple, we ignore it.
        mesos_version_tuple = None

    if mesos_version_tuple is not None and mesos_version_tuple < (0, 22, 1):
        logger.error(
            "Mesos master at %s is version %s, but we require at least "
            "Mesos version 0.22.1" % (MESOS_MASTER_URL, mesos_version))
        sys.exit(1)

    framework = mesos_pb2.FrameworkInfo()
    framework.user = SCHEDULER_USER
    framework.name = SCHEDULER_NAME
    framework.role = SCHEDULER_ROLE
    framework.failover_timeout = FAILOVER_TIMEOUT
    framework.checkpoint = True
    framework.webui_url = "http://%s:%d" % (EXTERNAL_WEB_HOST, EXTERNAL_WEB_PORT)
    # Reuse the stored framework id, if any, when registering.
    if data_root.meta.data.framework_id is not None:
        framework.id.value = data_root.meta.data.framework_id

    # Credentials are only built when authentication is switched on AND a
    # principal is configured; the secret itself is optional.
    credential = None
    principal = os.environ.get("DEFAULT_PRINCIPAL", None)
    if MESOS_AUTHENTICATE and principal:
        secret = os.environ.get("DEFAULT_SECRET", None)
        framework.principal = principal
        credential = mesos_pb2.Credential()
        credential.principal = principal
        if secret is not None:
            credential.secret = secret

    # Any non-empty MESOS_EXPLICIT_ACKNOWLEDGEMENTS value turns implicit
    # acknowledgements off.
    implicit_acknowledgments = 0 if os.getenv("MESOS_EXPLICIT_ACKNOWLEDGEMENTS") else 1

    scheduler = MemSQLScheduler(data_root)
    # The credential is passed as a trailing positional argument only when
    # one was built above.
    driver_args = [scheduler, framework, MESOS_MASTER_URL, implicit_acknowledgments]
    if credential is not None:
        driver_args.append(credential)
    driver = MesosSchedulerDriver(*driver_args)

    thread_manager.add(ClusterMonitor, {"driver": driver, "data_root": data_root})
    thread_manager.add(WebServer, {
        "host": "0.0.0.0",
        "port": EXTERNAL_WEB_PORT,
        "root_path": root_path,
        "pool": pool
    })

    def shutdown(*args):
        """Stop helper threads, then stop the driver once tasks drain.

        Accepts ``*args`` so it can be installed directly as a signal
        handler as well as called with no arguments.
        """
        global SHUTTING_DOWN
        SHUTTING_DOWN = True

        logger.info("MemSQL scheduler is shutting down")
        thread_manager.close(block=True)

        scheduler.shuttingDown = True

        def stop_timeout():
            # Wait up to SHUTDOWN_TIMEOUT seconds for running tasks to
            # finish before stopping the driver.
            wait_started = datetime.datetime.now()
            while (scheduler.tasksRunning > 0 and
                   SHUTDOWN_TIMEOUT > (datetime.datetime.now() - wait_started).total_seconds()):
                time.sleep(1)

            if scheduler.tasksRunning > 0:
                logger.warning("Shutdown by timeout, %d task(s) have not completed" % scheduler.tasksRunning)

            driver.stop(True)

        # The wait happens on its own thread so shutdown() returns promptly.
        stop_thread = Thread(target=stop_timeout)
        stop_thread.start()

    signal.signal(signal.SIGINT, shutdown)
    signal.signal(signal.SIGTERM, shutdown)

    try:
        thread_manager.start(timeout=START_TIMEOUT)
    except Exception:
        logger.error("Failed to start MemSQL Agent.", exc_info=True)
    else:
        logger.info("Exit with Ctrl-C")

        result = []

        # driver.run() blocks; we run it in a separate thread
        def run_driver_async():
            status = 0 if driver.run() == mesos_pb2.DRIVER_STOPPED else 1
            driver.stop(True)
            result.append(status)
        framework_thread = Thread(target=run_driver_async, args=())
        framework_thread.start()

        # Poll instead of join() -- presumably so the main thread stays
        # responsive to the signal handlers installed above (TODO confirm).
        while framework_thread.is_alive():
            time.sleep(1)

        # If the driver thread died with an exception it never recorded a
        # status; report failure rather than crashing on an empty list.
        sys.exit(result[0] if result else 1)
    finally:
        if not SHUTTING_DOWN:
            shutdown()
コード例 #2
0
ファイル: scheduler.py プロジェクト: Vlad777/memsql-mesos
    def resourceOffers(self, driver, offers):
        """Handle a batch of resource offers from the Mesos master.

        For each offer, the first cluster whose status is CREATING is
        selected and one agent task is launched for the first of its nodes
        that is still CREATING (the primary node is considered first).
        Once the last node is found to be past CREATING, the cluster is
        moved to WAITING_FOR_AGENTS. Every offer that does not end up
        launching a task is declined.

        Args:
            driver: Scheduler driver used to launch tasks and decline
                offers.
            offers: Sized collection of offers; each offer is read for its
                ``id``, ``hostname`` and ``resources``.
        """
        logger.debug("Got %d offers" % len(offers))

        clusters = self.root.clusters
        logger.debug("Current clusters: %s" % clusters.records)

        for offer in offers:
            launched = False
            logger.debug("Got resource offer [%s]" % offer.id.value)

            # Work on the first cluster that is still being created.
            cluster = next(
                (c for c in clusters
                 if c.data.status == const.ClusterStatus.CREATING),
                None)

            if self.shuttingDown:
                logger.debug("Shutting down: declining offer on [%s]" % offer.hostname)
                driver.declineOffer(offer.id)
                continue
            if cluster is None:
                logger.debug("Nothing to do, declining offer [%s]" % offer.id.value)
                driver.declineOffer(offer.id)
                continue

            cpus, mem, disk, ports = utils.get_resources(offer.resources, MEMSQL_SCHEDULER_ROLE)

            try:
                data = web_helpers.get_json_from_url(CURRENT_AGENT_VERSION_URL)
                agent_version = data["version"]
            except (web_helpers.GetJSONFromURLException, KeyError) as e:
                logger.error(
                    "Error while getting newest version of MemSQL Agent: %s. "
                    "Rolling back cluster" % str(e))
                self._rollback_cluster(cluster)
                # This offer will not be used, so decline it explicitly
                # like every other skip path in this method does.
                driver.declineOffer(offer.id)
                continue

            cluster.save(agent_version=agent_version)
            cluster.maybe_create_nodes()
            flavor = cluster.flavor

            # mem and disk are divided by 1024 before being compared with
            # the flavor; the agent task also needs AGENT_TASK_PORTS ports.
            if flavor.bigger_than(cpus, mem / 1024, disk / 1024) or len(ports) < AGENT_TASK_PORTS:
                logger.info("Not enough resources for agent task: "
                            "%s cpus, %s mem, %s disk, %s ports but "
                            "need %s, %s ports" % (
                                cpus, mem / 1024, disk / 1024, len(ports),
                                flavor, AGENT_TASK_PORTS))
                driver.declineOffer(offer.id)
                continue

            primary_node = cluster.nodes.find(agent_role=const.AgentRole.PRIMARY)

            # Put the primary node first so it is launched before followers.
            nodes = list(cluster.nodes)
            nodes.sort(key=lambda n: 1 if n is primary_node else 2)

            for i, node in enumerate(nodes):
                # With no designated primary, fall back to the first node.
                if primary_node is None:
                    primary_node = node

                if node.data.status == const.ClusterStatus.CREATING:
                    logger.info("Accepting offer for agent on [%s] for node %s" % (offer.hostname, node.data.memsql_role))

                    # Followers are told where the primary lives.
                    extras = {}
                    if node.data.agent_role == const.AgentRole.FOLLOWER:
                        extras['primary_host'] = primary_node.data.host
                        extras['primary_memsql_port'] = primary_node.data.memsql_port

                    host = utils.generate_host()
                    # Resolve once, before launching, so the task and the
                    # saved node record share the same address and a
                    # resolver failure cannot occur after the launch.
                    host_ip = socket.gethostbyname(offer.hostname)
                    task = self.make_agent_task(
                        offer=offer,
                        cpu=flavor.cpu,
                        mem=flavor.memory_mb,
                        disk=flavor.disk_mb,
                        host_ip=host_ip,
                        agent_host=host,
                        agent_port=ports[0],
                        memsql_port=ports[1],
                        demo_port=ports[2],
                        memsql_role=node.data.memsql_role,
                        cluster_name=cluster.name,
                        display_name=cluster.data.display_name,
                        install_demo=cluster.data.install_demo,
                        agent_version=cluster.data.agent_version,
                        **extras)
                    launched = True
                    driver.launchTasks(offer.id, [task])

                    node.save(
                        task_id=task.task_id.value,
                        status=const.ClusterStatus.WAITING_FOR_AGENTS,
                        host=host,
                        host_ip=host_ip,
                        agent_port=ports[0],
                        memsql_port=ports[1],
                        demo_port=ports[2])

                    # save this data in the cluster
                    if node.data.agent_role == const.AgentRole.PRIMARY:
                        cluster.save(
                            primary_host=offer.hostname,
                            primary_agent_port=ports[0],
                            primary_memsql_port=ports[1],
                            primary_demo_port=ports[2]
                        )

                    logger.info("Running agent on (%s) %s:%d, %d, %d" % (host, offer.hostname, ports[0], ports[1], ports[2]))
                    # Only one task is launched per offer.
                    break

                # Reached only when the last node is already past CREATING,
                # i.e. no node of this cluster is left to launch.
                if i == len(nodes) - 1:
                    cluster.save(status=const.ClusterStatus.WAITING_FOR_AGENTS)

            if not launched:
                logger.debug("Not doing anything anymore, declining offer [%s]" % offer.id.value)
                driver.declineOffer(offer.id)