Example #1
0
def launch_elsa(marathon, stats_file, scale_window):
    """Deploy the ElSA app through Marathon and autoscale it from a stats file.

    The function creates the ``elsa`` Marathon app, then loops forever: every
    ``scale_window`` seconds it reads an integer traffic counter from
    ``stats_file``, compares it with the previous reading and scales the app
    up or down. The loop ends on ``KeyboardInterrupt``, at which point the
    app is deleted again.

    :param marathon: address handed to ``MarathonClient``.
    :param stats_file: path of the file holding the current traffic counter
        as a plain integer; it is created with the content ``0`` if missing.
    :param scale_window: number of seconds to sleep between two samples.

    ``TRAFFIC_INCREASE_THRESHOLD`` and ``SCALE_FACTOR`` are read from module
    scope (defined outside this function).
    """
    logging.info('Start monitoring the inbound traffic on topics using %s' %
                 (stats_file))
    # make sure the stats file is properly initialized:
    if not os.path.exists(stats_file):
        with open(stats_file, 'w') as f:
            f.write('0')

    # launch the Elsa app via Marathon
    c = MarathonClient(marathon)
    c.create_app(
        'elsa',
        MarathonApp(cmd='/home/vagrant/elsa/launch-elsa.sh',
                    mem=200,
                    cpus=1,
                    user='******'))

    print(
        'ElSA is deployed and running, waiting now 5 sec before starting auto-scale ...'
    )
    time.sleep(5)  # allow time to deploy before autoscaling sets in

    # kick off traffic monitoring and trigger autoscaling:
    previous_topic_traffic = 0
    try:
        while True:
            # Only hold the file open for the read itself, not during the
            # Marathon round-trips below.
            with open(stats_file, 'r') as elsa_file:
                raw_traffic = elsa_file.read()

            try:
                topic_traffic = int(raw_traffic)
            except ValueError:
                # The file may be read while it is empty or half-written;
                # skip this sample instead of aborting the whole monitor.
                logging.warning('Ignoring unreadable traffic sample %r in %s',
                                raw_traffic, stats_file)
            else:
                topic_traffic_diff = topic_traffic - previous_topic_traffic
                print('Difference in traffic in the past %d seconds: %d' %
                      (scale_window, topic_traffic_diff))
                previous_topic_traffic = topic_traffic

                current_instance_num = c.get_app('elsa').instances

                if topic_traffic_diff > TRAFFIC_INCREASE_THRESHOLD:
                    # Surge above threshold: multiply the instance count.
                    # Clamp to 1 so a surge smaller than SCALE_FACTOR can
                    # never scale the app down to zero instances.
                    instance_multiplier = max(
                        1, int(topic_traffic_diff / SCALE_FACTOR))
                    target_instance_num = max(
                        1, current_instance_num * instance_multiplier)
                    c.scale_app('elsa', target_instance_num)
                    print('Increasing number of instances to %d' %
                          (target_instance_num))
                elif topic_traffic_diff < 0:
                    # Negative difference: back off by halving, never below 1.
                    target_instance_num = int(current_instance_num / 2)
                    if target_instance_num > 1:
                        c.scale_app('elsa', target_instance_num)
                        print('Decreasing number of instances to %d' %
                              (target_instance_num))
                    else:
                        c.scale_app('elsa', 1)
                        print('Resetting number of instances to 1')
            time.sleep(scale_window)
    except KeyboardInterrupt:
        print(
            'ElSA has been stopped by user, halting app and rolling back deployment. Thanks and bye!'
        )
        c.delete_app('elsa', force=True)
Example #2
0
class MarathonDeployer(object):
    """Deploys each task of a task chain as a separate Marathon app."""

    def __init__(self, marathon_url):
        """Remember the Marathon URL and open a client against it."""
        self.url = marathon_url
        self.client = MarathonClient(self.url)

    def deploy(self, task_chain, environment_name):
        """(Re)create one Marathon app per task in ``task_chain``.

        For every task listed by ``DeployedTaskChain`` the lower-cased task id
        is used as the app name. An already existing app of that name is
        deleted first (best effort: any error during lookup/delete is
        ignored), then a fresh app running ``/var/riversnake/invoke.py`` is
        created.
        """
        chain = DeployedTaskChain(task_chain, environment_name)
        for entry in chain.list_all_tasks():
            task_id = entry['id']
            app_name = task_id.lower()

            # Best-effort removal of a previous deployment of this task.
            try:
                existing = self.client.get_app(app_name)
                if existing:
                    self.client.delete_app(app_name)
                    time.sleep(2)
            except Exception:
                pass

            command = '/var/riversnake/invoke.py {0} {1} {2}'.format(
                task_chain.flow_name, environment_name, task_id)
            new_app = MarathonApp(cmd=command, mem=16, cpus=1)
            self.client.create_app(app_name, new_app)
Example #3
0
def launch_qsf(marathon_url):
    """Create the ``dromedar-qsf`` app on the Marathon instance at ``marathon_url``.

    The app runs ``qsf.py`` from the dromedar master archive and is passed the
    same Marathon URL as its argument.
    """
    logging.info('Launching QSF using %s' % (marathon_url))

    # launch via Marathon REST API
    qsf_app = MarathonApp(
        cmd='python dromedar-master/qsf.py %s' % (marathon_url),
        uris=['https://github.com/mhausenblas/dromedar/archive/master.zip'],
        mem=100,
        cpus=.5)
    client = MarathonClient(marathon_url)
    client.create_app('dromedar-qsf', qsf_app)

    logging.info('QSF up and running.')
Example #4
0
File: qsf.py Project: else/dromedar
def launch_drillbits(marathon_url, scale_factor):
    """Create the ``dromedar-drill`` Marathon app, then serve HTTP forever.

    ``scale_factor`` is only used in the log message here. After the app is
    created the function blocks in a ``SimpleHTTPRequestHandler`` server bound
    to ``QSF_PORT`` (a module-level constant) and does not return.
    """
    logging.info('Launching Drillbits using %s and scale factor %d' % (marathon_url, int(scale_factor)))

    # launch Drillbits via Marathon REST API
    client = MarathonClient(marathon_url)
    # Alternative deployment kept for reference:
    # client.create_app('dromedar-drill', MarathonApp(cmd='dromedar-master/launch-drillbit.sh', uris=['https://github.com/mhausenblas/dromedar/archive/master.zip'], mem=400, cpus=1))
    drill_app = MarathonApp(
        cmd='sudo /opt/drill/apache-drill-0.8.0/bin/drillbit.sh start',
        mem=400,
        cpus=1)
    client.create_app('dromedar-drill', drill_app)

    print('Drillbits are deployed: DATASETSIZE, NUM_DRILLBITS')

    listen_address = ("", QSF_PORT)
    httpd = SocketServer.TCPServer(listen_address, SimpleHTTPServer.SimpleHTTPRequestHandler)
    logging.info('Now listening to change requests on port %d' % (QSF_PORT))
    httpd.serve_forever()
Example #5
0
def launch(service):
    """Launch ``service`` as a Docker container through Marathon.

    The service definition is looked up in the module-level ``data`` mapping
    (``data['services'][service]``); Marathon and etcd endpoints are read from
    the same mapping. Missing ``instances``/``cpus``/``mem`` entries fall back
    to 1, 0.3 and 512 respectively.

    :param service: key of the service inside ``data['services']``; also used
        as the Marathon app id and exported as ``SERVICE_NAME``.
    """
    # Parenthesised form works as a statement on Python 2 and a call on 3.
    print('launching ' + service)
    service_dict = data['services'][service]
    image = service_dict['image']
    try:
        # list(): on Python 3 ``values()`` is a view, not a plain list.
        ports = list(service_dict['ports'].values())
    except (KeyError, AttributeError):
        # No 'ports' entry, or it is not a mapping (e.g. None or a list).
        ports = []
    instances = service_dict.get('instances') or 1
    cpus = service_dict.get('cpus') or 0.3
    mem = service_dict.get('mem') or 512
    #
    # env variables
    #
    env = {
        'ETCD_HOST_ADDRESS': data['etcd']['host'],
        'SERVICE_NAME': service,
    }
    # set up custom environment variables (may override the two above)
    custom_env = service_dict.get('environment')
    if custom_env:
        for key in custom_env.keys():
            env[key] = custom_env[key]
    options = []
    constraints = []

    #
    # TODO add support for this
    #
    if service == "cassandra":
        # Hard-coded port publishing and one-instance-per-host constraint.
        options = ["-p", "7000:7000", "-p", "9042:9042", "-p", "9160:9160", "-p", "22000:22", "-p", "5000:5000"]
        ports = []
        constraints = [["hostname", "UNIQUE"]]
    #
    # set up marathon client and launch container
    #
    marathon_client = MarathonClient('http://' + str(data['marathon']['host']) + ':' + str(data['marathon']['port']))
    marathon_client.create_app(
        container={
            "image": str("docker:///" + image),
            "options": options
        },
        id=service,
        instances=str(instances),
        constraints=constraints,
        cpus=str(cpus),
        mem=str(mem),
        env=env,
        ports=ports  # should be listed in order they appear in dockerfile
    )
def marathon_api_launch(image, options, marathon_app_id, instances, constraints, cpus, mem, env, ports):
    """Create a Docker-based Marathon app and return its id.

    The Marathon endpoint is built from the module-level ``marathon_host`` and
    ``marathon_port``. ``instances``, ``cpus`` and ``mem`` are sent as strings.

    :return: ``marathon_app_id``, unchanged.
    """
    endpoint = 'http://' + str(marathon_host) + ':' + str(marathon_port)
    client = MarathonClient(endpoint)

    docker_container = {
        "image": str("docker:///" + image),
        "options": options,
    }
    client.create_app(
        container=docker_container,
        id=marathon_app_id,
        instances=str(instances),
        constraints=constraints,
        cpus=str(cpus),
        mem=str(mem),
        env=env,
        ports=ports,  # should be listed in order they appear in dockerfile
    )
    return marathon_app_id
Example #7
0
class MarathonCluster(object):
    """Run dask workers as a Marathon app and scale them on request.

    On construction a Marathon app with zero instances is registered; workers
    are then added with :meth:`scale_up` and removed with :meth:`scale_down`.
    The instance can be used as a context manager, in which case the app is
    deleted on exit.
    """

    def __init__(self, scheduler,
                 executable='dask-worker',
                 docker_image='mrocklin/dask-distributed:1.15.2',
                 marathon_address='http://localhost:8080',
                 name=None, **kwargs):
        """Register the worker app with Marathon.

        :param scheduler: scheduler object; its ``address`` is passed to every
            worker and its ``worker_info`` is consulted by ``scale_down``.
        :param executable: worker executable placed first on the command line.
        :param docker_image: Docker image the workers run in.
        :param marathon_address: address handed to ``MarathonClient``.
        :param name: Marathon app id; a ``dask-<uuid4>`` id is generated when
            omitted.
        :param kwargs: forwarded to ``MarathonApp``. If ``mem`` is present a
            ``--memory-limit`` of ``mem * 0.6 * 1e6`` is added to the command.
        """
        self.scheduler = scheduler
        self.executor = ThreadPoolExecutor(1)

        # Create Marathon App to run dask-worker
        args = [executable, scheduler.address,
                '--name', '$MESOS_TASK_ID',  # use Mesos task ID as worker name
                '--worker-port', '$PORT_WORKER',
                '--bokeh-port', '$PORT_BOKEH',
                '--nanny-port', '$PORT_NANNY',
                '--http-port', '$PORT_HTTP']

        # The loop variable must not be called ``name``: on Python 2 a list
        # comprehension leaks its variable and would overwrite the ``name``
        # parameter used for the app id below.
        ports = [{'port': 0,
                  'protocol': 'tcp',
                  'name': port_name}
                 for port_name in ['worker', 'nanny', 'http', 'bokeh']]

        if 'mem' in kwargs:
            args.extend(['--memory-limit',
                         str(int(kwargs['mem'] * 0.6 * 1e6))])

        kwargs['cmd'] = ' '.join(args)
        container = MarathonContainer({'image': docker_image})

        app = MarathonApp(instances=0,
                          container=container,
                          port_definitions=ports,
                          **kwargs)

        # Connect and register app
        self.client = MarathonClient(marathon_address)
        self.app = self.client.create_app(name or 'dask-%s' % uuid.uuid4(), app)

    def scale_up(self, instances):
        """Ask Marathon (in the background thread) to run ``instances`` workers."""
        self.executor.submit(self.client.scale_app,
                             self.app.id, instances=instances)

    def scale_down(self, workers):
        """Kill the Marathon tasks behind ``workers`` and shrink the app.

        Each worker key is mapped to its task via
        ``scheduler.worker_info[w]['name']``; the kill is submitted to the
        background thread with ``scale=True``.
        """
        for w in workers:
            self.executor.submit(self.client.kill_task,
                                 self.app.id,
                                 self.scheduler.worker_info[w]['name'],
                                 scale=True)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        """Delete the Marathon app and release the background executor."""
        try:
            self.client.delete_app(self.app.id, force=True)
        finally:
            # Do not block on queued scale requests; just stop accepting new
            # ones and let the worker thread wind down.
            self.executor.shutdown(wait=False)
Example #8
0
def launch_elsa(marathon, stats_file, scale_window):
    """Deploy the ElSA app through Marathon and autoscale it from a stats file.

    Creates the ``elsa`` Marathon app, then polls ``stats_file`` every
    ``scale_window`` seconds. The file holds a plain integer traffic counter;
    the difference to the previous reading decides whether the app is scaled
    up, scaled down or left alone. ``KeyboardInterrupt`` ends the loop and
    deletes the app.

    :param marathon: address handed to ``MarathonClient``.
    :param stats_file: path of the traffic counter file; created with ``0``
        if it does not exist.
    :param scale_window: seconds to sleep between two samples.

    ``TRAFFIC_INCREASE_THRESHOLD`` and ``SCALE_FACTOR`` come from module scope.
    """
    logging.info('Start monitoring the inbound traffic on topics using %s' %(stats_file))
    # make sure the stats file is properly initialized:
    if not os.path.exists(stats_file):
        with open(stats_file, 'w') as f:
            f.write('0')

    # launch the Elsa app via Marathon
    c = MarathonClient(marathon)
    c.create_app('elsa', MarathonApp(cmd='/home/vagrant/elsa/launch-elsa.sh', mem=200, cpus=1, user='******'))

    print('ElSA is deployed and running, waiting now 5 sec before starting auto-scale ...')
    time.sleep(5) # allow time to deploy before autoscaling sets in

    # kick off traffic monitoring and trigger autoscaling:
    previous_topic_traffic = 0
    try:
        while True:
            # Close the file before talking to Marathon.
            with open(stats_file, 'r') as elsa_file:
                raw_traffic = elsa_file.read()

            try:
                topic_traffic = int(raw_traffic)
            except ValueError:
                # Empty or half-written file: skip this sample rather than
                # crash and leave the app deployed without supervision.
                logging.warning('Ignoring unreadable traffic sample %r in %s', raw_traffic, stats_file)
            else:
                topic_traffic_diff = topic_traffic - previous_topic_traffic
                print('Difference in traffic in the past %d seconds: %d' %(scale_window, topic_traffic_diff))
                previous_topic_traffic = topic_traffic

                current_instance_num = c.get_app('elsa').instances

                if topic_traffic_diff > TRAFFIC_INCREASE_THRESHOLD: # we see a surge of traffic above threshold ...
                    # ... increase number of instances; the multiplier and
                    # the target are clamped to 1 so a surge never scales
                    # the app to zero.
                    instance_multiplier = max(1, int(topic_traffic_diff / SCALE_FACTOR))
                    target_instance_num = max(1, current_instance_num * instance_multiplier)
                    c.scale_app('elsa', target_instance_num)
                    print('Increasing number of instances to %d' %(target_instance_num))
                elif topic_traffic_diff < 0: # negative, back off exponentially
                    target_instance_num = int(current_instance_num/2)
                    if target_instance_num > 1:
                        c.scale_app('elsa', target_instance_num)
                        print('Decreasing number of instances to %d' %(target_instance_num))
                    else:
                        c.scale_app('elsa', 1)
                        print('Resetting number of instances to 1')
            time.sleep(scale_window)
    except KeyboardInterrupt:
        print('ElSA has been stopped by user, halting app and rolling back deployment. Thanks and bye!')
        c.delete_app('elsa', force=True)
class TestCreateApp(unittest.TestCase):
    """
    Test the creation of a Marathon app against a live endpoint. Configure MARATHON_SERVER in tests.config.
    """

    # Upper bound (seconds) on how long setUp waits for healthy tasks, so a
    # broken deployment fails the test instead of hanging the run forever.
    HEALTHY_TIMEOUT = 120

    def setUp(self):
        self._app = get_app()  # Generate a random server configuration.
        self.client = MarathonClient(MARATHON_SERVER)
        self.client.create_app(app_id=self._app.id, app=self._app)
        time.sleep(2)  # Wait two seconds for the POST to be processed by Marathon.

        deadline = time.time() + self.HEALTHY_TIMEOUT
        self.app = self.client.get_app(self._app.id)
        while not self.app.tasks_healthy:  # Wait until the app becomes healthy.
            if time.time() >= deadline:
                # tearDown is not run when setUp fails, so remove the app here.
                self.client.delete_app(self._app.id, force=True)
                self.fail("App %s did not become healthy within %d seconds"
                          % (self._app.id, self.HEALTHY_TIMEOUT))
            time.sleep(1)
            self.app = self.client.get_app(self._app.id)

    def test_create(self):
        """The fetched app and its nested objects are the expected model types."""
        self.assertIsInstance(self.app, MarathonApp)
        self.assertIsInstance(self.app.upgrade_strategy, MarathonUpgradeStrategy)
        self.assertIsInstance(self.app.tasks.pop(), MarathonTask)
        self.assertIsInstance(self.app.health_checks.pop(), MarathonHealthCheck)

    def tearDown(self):
        self.client.delete_app(self.app.id, force=True)
Example #10
0
class Services(object):
    """Bulk operations on the apps of a Marathon cluster."""

    def __init__(self, endpoints):
        """Open a ``MarathonClient`` against ``endpoints``."""
        self.marathon = MarathonClient(endpoints)

    def list(self):
        """Return whatever ``MarathonClient.list_apps()`` reports."""
        return self.marathon.list_apps()

    def clean(self, pattern=None):
        """Force-delete apps, optionally restricted by a regex.

        :param pattern: regular expression matched (``re.match``, i.e. anchored
            at the start) against each app id. ``None`` deletes every app.

        A failure on one app is logged and does not stop the remaining ones.
        """
        for app in self.list():
            try:
                if pattern is None or re.match(pattern, app.id) is not None:
                    logging.info("Deleting app: %s", app.id)
                    self.marathon.delete_app(app.id, force=True)
                else:
                    logging.info("Ignoring app %s. Did not match pattern %s",
                                 app.id, pattern)
            except Exception:
                # Best effort per app; KeyboardInterrupt/SystemExit still
                # propagate so the run can be aborted.
                logger.info("Unable to delete app %s", app.id)
                traceback.print_exc()

    def register_services(self, service_registry="conf/marathon"):
        """Create one Marathon app per ``*json`` file in ``service_registry``.

        Each file is parsed as JSON, its keys are converted with
        ``Names.snake_case`` and the result is passed to ``MarathonApp``.
        Errors while creating an app are printed and the next file is tried.
        """
        for app_def in glob.glob(os.path.join(service_registry, "*json")):
            # Read the definition, then release the file before the
            # (potentially slow) Marathon call.
            with open(app_def, "r") as stream:
                args = json.loads(stream.read())
            app_id = args['id']
            args = Names.snake_case(args)
            logger.debug("Creating service: %s", json.dumps(args,
                                                            indent=2))
            args['tasks'] = []
            app = MarathonApp(**args)
            try:
                logging.info("Creating app [id=>{0}]".format(app_id))
                self.marathon.create_app(app_id, app)
            except Exception:
                traceback.print_exc()
def new_deploy(app_name, app_file):
    """Create a new Marathon deployment from a JSON application definition.

    :param app_name: id under which the application is created; also used to
        check whether it is already deployed.
    :param app_file: path of the JSON file used as the request body.
    :return: JSON of the created application, or ``None`` when an application
        of that name is already deployed.
    """
    marathon_addresses = _addresses()
    with open(app_file, 'r') as definition:
        app_attr = json.loads(definition.read())

    cli = MarathonClient(marathon_addresses)
    # Guard clause: nothing to do for an app that already exists.
    if _is_deployed(cli, app_name):
        return None

    m_app = models.MarathonApp.from_json(app_attr)
    return cli.create_app(app_name, m_app).to_json()
Example #12
0
def new_deploy(app_name, app_file):
    """Create a new Marathon deployment from a JSON application definition.

    :param app_name: id under which the application is created; also used to
        check whether it is already deployed.
    :param app_file: path of the JSON file used as the request body.
    :return: JSON of the created application, or ``None`` when an application
        of that name is already deployed.
    """
    marathon_addresses = _addresses()
    with open(app_file, 'r') as definition:
        raw_definition = definition.read()
    app_attr = json.loads(raw_definition)

    cli = MarathonClient(marathon_addresses)
    already_deployed = _is_deployed(cli, app_name)
    if already_deployed:
        return None

    created_app = cli.create_app(app_name,
                                 models.MarathonApp.from_json(app_attr))
    return created_app.to_json()
Example #13
0
def _create_application(client: MarathonClient, app: MarathonApp,
                        definition_path: str) -> Union[str, bool]:
    """Create ``app`` on Marathon and wait for its deployment.

    :param client: Marathon client used for the request.
    :param app: application definition to create; ``app.id`` is the app id.
    :param definition_path: path the definition was loaded from (used in
        messages and forwarded to ``_update_application``).
    :return: the id of the deployed app, or ``False`` if
        ``poll_deployments_for_app`` reports the creation was cancelled. When
        the app already exists (HTTP 409) the result of
        ``_update_application`` is returned instead.
    :raises MarathonHttpError: for any HTTP error other than 409.

    Exits the process with status 1 when ``create_app`` returns ``False``.
    """
    print('\nCreating app: {} (from: {})'.format(app.id, definition_path))
    try:
        # Keep the result in its own name: rebinding ``app`` here would make
        # ``app.id`` below fail with AttributeError when the result is False.
        created = client.create_app(app.id, app)
    except MarathonHttpError as error:
        if error.status_code == 409:
            # If somehow didn't come up before...
            print('Application already exists. Updating...')
            return _update_application(client, app, definition_path)
        raise
    if created is False:
        print('Deployment of {} failed'.format(app.id))
        sys.exit(1)
    # TODO: Migrate to `wait_for_deployment`
    # Return the deployed appid to build rollback order, if necessary
    # or False if the creation was cancelled
    return False if not poll_deployments_for_app(client, created) else created.id
Example #14
0
class MarathonHTTPClient(object):
    """Container scheduler client that drives Marathon apps running ``docker run``.

    One-off commands (``run``) are delegated to a ``FleetHTTPClient``.
    """

    def __init__(self, target, auth, options, pkey):
        # NOTE: the ``target`` argument is not used; the Marathon host is
        # taken from ``settings.MARATHON_HOST``.
        self.target = settings.MARATHON_HOST
        self.auth = auth
        self.options = options
        self.pkey = pkey
        self.registry = settings.REGISTRY_HOST + ':' + settings.REGISTRY_PORT
        self.client = MarathonClient('http://' + self.target + ':8180')
        self.fleet = FleetHTTPClient('/var/run/fleet.sock', auth, options, pkey)

    # helpers
    def _app_id(self, name):
        """Translate a container name into a Marathon app id (``_`` -> ``.``)."""
        return name.replace('_', '.')

    # container api
    def create(self, name, image, command='', **kwargs):
        """Create a container"""
        app_id = self._app_id(name)
        name_parts = re.match(MATCH, name).groupdict()
        image = self.registry + '/' + image

        # Memory limit for this container type, e.g. "512m" or "512mb";
        # the unit suffix is stripped and the number is used as-is.
        mem_amount = 0
        mem_spec = kwargs.get('memory', {}).get(name_parts['c_type'])
        if mem_spec:
            mem_spec = mem_spec.lower()
            two_letter_unit = mem_spec[-2:-1].isalpha() and mem_spec[-1].isalpha()
            if two_letter_unit:
                mem_spec = mem_spec[:-1]
            mem_amount = int(mem_spec[:-1])

        cpu_spec = kwargs.get('cpu', {}).get(name_parts['c_type'])
        cpu_share = cpu_spec if cpu_spec else 0.5

        cmd = "docker run --name {name} -P {image} {command}".format(
            name=name, image=image, command=command)
        self.client.create_app(app_id, MarathonApp(cmd=cmd, mem=mem_amount, cpus=cpu_share))
        # Created apps start scaled to zero; wait until no task is running.
        self.client.scale_app(app_id, 0, force=True)
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(app_id).tasks_running == 0:
                return
            time.sleep(1)

    def start(self, name):
        """Start a container"""
        app_id = self._app_id(name)
        self.client.scale_app(app_id, 1, force=True)
        for _ in xrange(POLL_ATTEMPTS):
            running = self.client.get_app(app_id).tasks_running
            if running == 1:
                break
            time.sleep(1)
        task_host = self.client.get_app(app_id).tasks[0].host
        self._waitforcontainer(task_host, name)

    def stop(self, name):
        """Stop a container"""
        raise NotImplementedError

    def destroy(self, name):
        """Destroy a container"""
        try:
            task_host = self.client.get_app(self._app_id(name)).tasks[0].host
            self.client.delete_app(self._app_id(name), force=True)
            self._delete_container(task_host, name)
        except:
            # Any failure above: fall back to (re)issuing the app deletion.
            self.client.delete_app(self._app_id(name), force=True)

    def _get_container_state(self, host, name):
        """Return ``JobState.up`` if the container runs on ``host``.

        Returns ``JobState.destroyed`` when inspection fails, and ``None``
        when the container exists but is not running.
        """
        docker_cli = Client("tcp://{}:2375".format(host), timeout=1200, version='1.17')
        try:
            if docker_cli.inspect_container(name)['State']['Running']:
                return JobState.up
        except:
            return JobState.destroyed

    def _waitforcontainer(self, host, name):
        """Poll once per second until the container is up, or raise."""
        for _ in xrange(POLL_WAIT):
            container_state = self._get_container_state(host, name)
            if container_state == JobState.up:
                return
            time.sleep(1)
        raise RuntimeError("App container Not Started")

    def _delete_container(self, host, name):
        """Force-remove the Docker container ``name`` on ``host`` if it has state."""
        docker_cli = Client("tcp://{}:2375".format(host), timeout=1200, version='1.17')
        if docker_cli.inspect_container(name)['State']:
            docker_cli.remove_container(name, force=True)

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command"""
        return self.fleet.run(name, image, entrypoint, command)

    def state(self, name):
        """Map the app's running-task count to a ``JobState``.

        One running task means ``up``, zero means ``created``; any error while
        querying Marathon is reported as ``destroyed``.
        """
        try:
            for _ in xrange(POLL_ATTEMPTS):
                # The app is queried separately for each comparison.
                if self.client.get_app(self._app_id(name)).tasks_running == 1:
                    return JobState.up
                elif self.client.get_app(self._app_id(name)).tasks_running == 0:
                    return JobState.created
                time.sleep(1)
        except:
            return JobState.destroyed

    def attach(self, name):
        """
        Attach to a job's stdin, stdout and stderr
        """
        raise NotImplementedError
Example #15
0
class MarathonSpawner(Spawner):
    """JupyterHub spawner that runs each single-user server as a Marathon app.

    The app id is ``/<app_prefix>/<username>``; the container, volumes, port
    mappings, constraints and health checks are assembled from the
    configurable traits below.
    """

    app_image = Unicode("jupyterhub/singleuser", config=True)

    app_prefix = Unicode(
        "jupyter",
        help=dedent(
            """
            Prefix for app names. The full app name for a particular
            user will be <prefix>/<username>.
            """
        )
    ).tag(config=True)

    marathon_host = Unicode(
        u'',
        help="Hostname of Marathon server").tag(config=True)

    marathon_constraints = List(
        [],
        help='Constraints to be passed through to Marathon').tag(config=True)

    ports = List(
        [8888],
        help='Ports to expose externally'
        ).tag(config=True)

    volumes = List(
        [],
        help=dedent(
            """
            A list in Marathon REST API format for mounting volumes into the docker container.
            [
                {
                    "containerPath": "/foo",
                    "hostPath": "/bar",
                    "mode": "RW"
                }
            ]

            Note that using the template variable {username} in containerPath,
            hostPath or the name variable in case it's an external drive
            it will be replaced with the current user's name.
            """
        )
    ).tag(config=True)

    network_mode = Unicode(
        'BRIDGE',
        help="Enum of BRIDGE or HOST"
        ).tag(config=True)

    hub_ip_connect = Unicode(
        "",
        help="Public IP address of the hub"
        ).tag(config=True)

    hub_port_connect = Integer(
        -1,
        help="Public PORT of the hub"
        ).tag(config=True)

    format_volume_name = Any(
        help="""Any callable that accepts a string template and a Spawner
        instance as parameters in that order and returns a string.
        """
    ).tag(config=True)

    @default('format_volume_name')
    def _get_default_format_volume_name(self):
        return default_format_volume_name

    # One single-threaded executor shared by all spawner instances; created
    # lazily on first access.
    _executor = None
    @property
    def executor(self):
        cls = self.__class__
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(1)
        return cls._executor

    def __init__(self, *args, **kwargs):
        super(MarathonSpawner, self).__init__(*args, **kwargs)
        self.marathon = MarathonClient(self.marathon_host)

    @property
    def container_name(self):
        """Marathon app id for the current user: ``/<app_prefix>/<username>``."""
        return '/%s/%s' % (self.app_prefix, self.user.name)

    def get_state(self):
        """Return the parent state extended with ``container_name``."""
        state = super(MarathonSpawner, self).get_state()
        state['container_name'] = self.container_name
        return state

    def load_state(self, state):
        # Nothing is restored: ``container_name`` is always recomputed from
        # the prefix and the user name.
        if 'container_name' in state:
            pass

    def get_health_checks(self):
        """Return the single TCP health check used for the app."""
        health_checks = []
        health_checks.append(MarathonHealthCheck(
            protocol='TCP',
            port_index=0,
            grace_period_seconds=300,
            interval_seconds=60,
            timeout_seconds=20,
            max_consecutive_failures=0
            ))
        return health_checks

    def get_volumes(self):
        """Build volume models from ``self.volumes``, formatting path templates.

        ``container_path``, ``host_path`` and, for external volumes, the
        ``name`` entry are each passed through ``format_volume_name``.
        """
        volumes = []
        for v in self.volumes:
            mv = MarathonContainerVolume.from_json(v)
            mv.container_path = self.format_volume_name(mv.container_path, self)
            mv.host_path = self.format_volume_name(mv.host_path, self)
            if mv.external and 'name' in mv.external:
                mv.external['name'] = self.format_volume_name(mv.external['name'], self)
            volumes.append(mv)
        return volumes

    def get_port_mappings(self):
        """Map every configured container port to a host port of 0 over TCP."""
        port_mappings = []
        for p in self.ports:
            port_mappings.append(
                MarathonContainerPortMapping(
                    container_port=p,
                    host_port=0,
                    protocol='tcp'
                )
            )
        return port_mappings

    def get_constraints(self):
        """Return ``marathon_constraints`` converted to ``MarathonConstraint`` objects."""
        # The list must be returned: ``start`` passes this value straight to
        # ``MarathonApp(constraints=...)``, and a missing return would drop
        # every configured constraint by handing over ``None``.
        return [MarathonConstraint.from_json(c)
                for c in self.marathon_constraints]

    @run_on_executor
    def get_deployment(self, deployment_id):
        """Return the running deployment with ``deployment_id``, or ``None``."""
        deployments = self.marathon.list_deployments()
        for d in deployments:
            if d.id == deployment_id:
                return d
        return None

    @run_on_executor
    def get_deployment_for_app(self, app_name):
        """Return the first deployment affecting ``app_name``, or ``None``."""
        deployments = self.marathon.list_deployments()
        for d in deployments:
            if app_name in d.affected_apps:
                return d
        return None

    def get_ip_and_port(self, app_info):
        """Resolve the host of the app's only task and return ``(ip, first_port)``."""
        assert len(app_info.tasks) == 1
        ip = socket.gethostbyname(app_info.tasks[0].host)
        return (ip, app_info.tasks[0].ports[0])

    @run_on_executor
    def get_app_info(self, app_name):
        """Fetch the app with its tasks; ``None`` if Marathon does not know it."""
        try:
            app = self.marathon.get_app(app_name, embed_tasks=True)
        except NotFoundError:
            self.log.info("The %s application has not been started yet", app_name)
            return None
        else:
            return app

    def _public_hub_api_url(self):
        """Return the hub API URL with host/port replaced by the public overrides."""
        uri = urlparse(self.hub.api_url)
        port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port
        ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname
        return urlunparse((
            uri.scheme,
            '%s:%s' % (ip, port),
            uri.path,
            uri.params,
            uri.query,
            uri.fragment
            ))

    def get_env(self):
        """Return the parent environment plus the ``JPY_*`` hub variables."""
        env = super(MarathonSpawner, self).get_env()
        env.update(dict(
            # Jupyter Hub config
            JPY_USER=self.user.name,
            JPY_COOKIE_NAME=self.user.server.cookie_name,
            JPY_BASE_URL=self.user.server.base_url,
            JPY_HUB_PREFIX=self.hub.server.base_url,
        ))

        if self.notebook_dir:
            env['NOTEBOOK_DIR'] = self.notebook_dir

        # Use the public address only when at least one override is set.
        if self.hub_ip_connect or self.hub_port_connect > 0:
            hub_api_url = self._public_hub_api_url()
        else:
            hub_api_url = self.hub.api_url
        env['JPY_HUB_API_URL'] = hub_api_url
        return env

    @gen.coroutine
    def start(self):
        """Create the Marathon app and wait until its task is healthy.

        :return: ``(ip, port)`` of the running task, or ``None`` when the app
            could not be created.
        """
        docker_container = MarathonDockerContainer(
            image=self.app_image,
            network=self.network_mode,
            port_mappings=self.get_port_mappings())

        app_container = MarathonContainer(
            docker=docker_container,
            type='DOCKER',
            volumes=self.get_volumes())

        # the memory request in marathon is in MiB
        if hasattr(self, 'mem_limit') and self.mem_limit is not None:
            mem_request = self.mem_limit / 1024.0 / 1024.0
        else:
            mem_request = 1024.0

        app_request = MarathonApp(
            id=self.container_name,
            env=self.get_env(),
            cpus=self.cpu_limit,
            mem=mem_request,
            container=app_container,
            constraints=self.get_constraints(),
            health_checks=self.get_health_checks(),
            instances=1
            )

        app = self.marathon.create_app(self.container_name, app_request)
        if app is False or app.deployments is None:
            self.log.error("Failed to create application for %s", self.container_name)
            return None

        # Poll once per second until exactly one task reports healthy.
        while True:
            app_info = yield self.get_app_info(self.container_name)
            if app_info and app_info.tasks_healthy == 1:
                ip, port = self.get_ip_and_port(app_info)
                break
            yield gen.sleep(1)
        return (ip, port)

    @gen.coroutine
    def stop(self, now=False):
        """Delete the app; unless ``now`` is set, wait for the deletion deployment to finish."""
        try:
            status = self.marathon.delete_app(self.container_name)
        except:
            self.log.error("Could not delete application %s", self.container_name)
            raise
        else:
            if not now:
                while True:
                    deployment = yield self.get_deployment(status['deploymentId'])
                    if deployment is None:
                        break
                    yield gen.sleep(1)

    @gen.coroutine
    def poll(self):
        """Report the app status in the Spawner ``poll`` convention.

        Returns ``None`` while the app is deploying or has one healthy task,
        ``1`` when a ``StopApplication`` action is in progress, and ``0``
        otherwise.
        """
        deployment = yield self.get_deployment_for_app(self.container_name)
        if deployment:
            for current_action in deployment.current_actions:
                if current_action.action == 'StopApplication':
                    self.log.error("Application %s is shutting down", self.container_name)
                    return 1
            return None

        app_info = yield self.get_app_info(self.container_name)
        if app_info and app_info.tasks_healthy == 1:
            return None
        return 0
Example #16
0
File: mmapi.py Project: annym/hydra
class MarathonIF(object):
    """Convenience wrapper around ``MarathonClient`` with retry and wait helpers."""

    def __init__(self, marathon_addr, my_addr, mesos):
        self.mcli = MarathonClient(marathon_addr)
        self.myAddr = my_addr
        self.mesos = mesos

    def get_apps(self):
        """Return the result of ``MarathonClient.list_apps()``."""
        listapps = self.mcli.list_apps()
        return listapps

    def get_app(self, app_id):
        """Return the app with ``app_id``, or ``None`` if Marathon reports it missing."""
        try:
            a = self.mcli.get_app(app_id)
        except marathon.exceptions.NotFoundError as e:  # NOQA
            return None
        return a

    def delete_app(self, app_id, force=False):
        return self.mcli.delete_app(app_id, force)

    def delete_deployment(self, dep_id):
        return self.mcli.delete_deployment(dep_id)

    def get_deployments(self):
        return self.mcli.list_deployments()

    def delete_app_ifexisting(self, app_id, trys=4):
        """Delete ``app_id`` if it exists, retrying on errors.

        :param app_id: id of the app to delete.
        :param trys: maximum number of attempts; 10 seconds are slept after
            each failed attempt.
        :return: the result of ``delete_app``, or ``None`` if the app does not
            exist.
        :raises Exception: the error of the last attempt once all attempts
            failed.
        :raises RuntimeError: if ``trys`` allows no attempt at all.
        """
        last_error = None
        for idx in range(0, trys):
            try:
                a = self.get_app(app_id)
                if a:
                    return self.delete_app(app_id)
                return None
            except Exception as err:
                last_error = err
                e = sys.exc_info()[0]
                pprint("<p>Error: %s</p>" % e)
                time.sleep(10)
        # A bare ``raise`` is not usable here: outside an ``except`` block
        # Python 3 has no active exception and raises RuntimeError instead of
        # the real failure, so the last error is re-raised explicitly.
        if last_error is None:
            raise RuntimeError("delete_app_ifexisting made no attempt (trys=%r)" % (trys,))
        raise last_error

    @staticmethod
    def is_valid_app_id(app_id):
        """Return True if ``app_id`` consists only of permitted characters.

        The pattern accepts ASCII letters of either case, digits, hyphens,
        slashes and dots (and the empty string).
        NOTE(review): the error message in ``create_app`` says only lowercase
        letters are allowed, but the pattern also accepts uppercase - confirm
        which one is intended.
        """
        if re.match("^[A-Za-z0-9-/.]*$", app_id):
            return True
        return False

    def create_app(self, app_id, attr):
        """
            Create and start an app.
            :param app_id: (str) - Application ID
            :param attr: marathon.models.app.MarathonApp application to create.
            :return: the created app
            :raises Exception: if ``app_id`` fails ``is_valid_app_id``.
            :raises marathon.exceptions.MarathonHttpError: immediately for any
                HTTP error other than "app is locked", or after 10 attempts
                that all hit the lock.
        """
        # Validate that app_id conforms to allowed naming scheme.
        if not self.is_valid_app_id(app_id):
            l.error("Error: Only lowercase letters, digits, hyphens are allowed in app_id. %s" % app_id)
            raise Exception("Invalid app_id")

        last_error = None
        for idx in range(0, 10):
            try:
                a = self.mcli.create_app(app_id, attr)
                return a
            except marathon.exceptions.MarathonHttpError as e:
                if str(e).find('App is locked by one or more deployments. Override with the option') >= 0:
                    # Another deployment holds the app; retry after a pause.
                    last_error = e
                    time.sleep(1)
                else:
                    raise
        # All attempts hit the deployment lock: surface the last lock error
        # (a bare ``raise`` here would have no active exception on Python 3).
        raise last_error

    def wait_app_removal(self, app):
        """Block until ``get_app(app)`` returns nothing; always returns True.

        Polls every 0.2 seconds and logs after every unsuccessful poll.
        """
        cnt = 0
        while True:
            if not self.get_app(app):
                break
            time.sleep(0.2)
            cnt += 1
            if cnt > 0:
                l.info("Stuck waiting for %s to be deleted CNT=%d" % (app, cnt))
        return True

    def wait_app_ready(self, app, running_count):
        """Block until the app has exactly ``running_count`` running tasks.

        Polls once per second and logs progress every 30 polls.

        :return: the app object from the poll that satisfied the condition.
        """
        cnt = 0
        while True:
            a1 = self.get_app(app)
            if a1.tasks_running == running_count:
                return a1
            cnt += 1
            time.sleep(1)
            if (cnt % 30) == 29:
                l.info("[%d]Waiting for task to move to running stage, " % cnt +
                       "current stat staged=%d running=%d expected Running=%d" %
                       (a1.tasks_staged, a1.tasks_running, running_count))

    def scale_app(self, app, scale):
        return self.mcli.scale_app(app, scale)

    def ping(self):
        return self.mcli.ping()
Example #17
0
class MarathonIF(object):
    """Convenience wrapper around a MarathonClient.

    Most methods forward straight to the client held in ``self.mcli``; the
    rest add retry loops or polling on top of it.
    """

    def __init__(self, marathon_addr, my_addr, mesos):
        """Create the MarathonClient for ``marathon_addr`` and store the rest.

        :param marathon_addr: passed to MarathonClient
        :param my_addr: kept as ``self.myAddr``
        :param mesos: kept as ``self.mesos``
        """
        self.mcli = MarathonClient(marathon_addr)
        self.myAddr = my_addr
        self.mesos = mesos

    def get_apps(self):
        """Return the result of ``self.mcli.list_apps()``."""
        return self.mcli.list_apps()

    def get_app(self, app_id):
        """Return the app for ``app_id``, or None on NotFoundError."""
        try:
            return self.mcli.get_app(app_id)
        except marathon.exceptions.NotFoundError:
            return None

    def delete_app(self, app_id, force=False):
        """Forward to ``self.mcli.delete_app(app_id, force)``."""
        return self.mcli.delete_app(app_id, force)

    def delete_deployment(self, dep_id):
        """Forward to ``self.mcli.delete_deployment(dep_id)``."""
        return self.mcli.delete_deployment(dep_id)

    def get_deployments(self):
        """Return the result of ``self.mcli.list_deployments()``."""
        return self.mcli.list_deployments()

    def delete_app_ifexisting(self, app_id, trys=4):
        """Delete ``app_id`` when it exists, retrying on errors.

        :param app_id: application ID
        :param trys: maximum number of attempts; failed attempts are
            separated by a 10 second pause
        :return: the delete_app() result, or None when the app does not exist
        :raises Exception: the error of the last attempt once all have failed
        :raises RuntimeError: when ``trys`` allows no attempt at all
        """
        last_error = None
        for idx in range(0, trys):
            try:
                if self.get_app(app_id):
                    return self.delete_app(app_id)
                return None
            except Exception as err:
                # Remember the error so it can be re-raised after the loop;
                # a bare `raise` there has no active exception to re-raise.
                last_error = err
                pprint("<p>Error: %s</p>" % sys.exc_info()[0])
                # No point in waiting when there is no further attempt
                if idx < trys - 1:
                    time.sleep(10)
        if last_error is None:
            raise RuntimeError(
                "delete_app_ifexisting: no attempt was made (trys=%r)" % (trys,))
        raise last_error

    def create_app(self, app_id, attr):
        """Create an app, retrying while Marathon reports it as locked.

        Up to 10 attempts are made, one second apart. Only the "App is locked
        by one or more deployments" error is retried; any other
        MarathonHttpError propagates immediately.

        :param app_id: application ID
        :param attr: app definition handed to ``self.mcli.create_app``
        :return: the result of ``self.mcli.create_app``
        :raises marathon.exceptions.MarathonHttpError: on a non-"locked"
            error, or the last "locked" error once every attempt failed
        """
        max_attempts = 10
        last_error = None
        for idx in range(0, max_attempts):
            try:
                return self.mcli.create_app(app_id, attr)
            except marathon.exceptions.MarathonHttpError as e:
                if str(e).find('App is locked by one or more deployments. Override with the option') < 0:
                    raise
                # Python 3 unbinds `e` when the handler exits
                last_error = e
            if idx < max_attempts - 1:
                time.sleep(1)
        raise last_error

    def wait_app_removal(self, app):
        """Poll every 0.2 s until get_app() stops returning ``app``.

        Logs the poll count after each unsuccessful check. Always returns True.
        """
        cnt = 0
        while self.get_app(app):
            time.sleep(0.2)
            cnt += 1
            l.info("Stuck waiting for %s to be deleted CNT=%d" % (app, cnt))
        return True

    def wait_app_ready(self, app, running_count):
        """Poll once per second until ``app`` has ``running_count`` running tasks.

        :return: the app object from the poll that matched
        :raises AttributeError: when get_app() returns None (app not found),
            because the result is dereferenced without a check
        """
        cnt = 0
        while True:
            a1 = self.get_app(app)
            if a1.tasks_running == running_count:
                return a1
            cnt += 1
            time.sleep(1)
            # Report progress on poll 29, 59, 89, ...
            if (cnt % 30) == 29:
                l.info("[%d]Waiting for task to move to running stage, " % cnt +
                       "current stat staged=%d running=%d expected Running=%d" %
                       (a1.tasks_staged, a1.tasks_running, running_count))

    def scale_app(self, app, scale):
        """Forward to ``self.mcli.scale_app(app, scale)``."""
        return self.mcli.scale_app(app, scale)

    def ping(self):
        """Forward to ``self.mcli.ping()``."""
        return self.mcli.ping()
Example #18
0
class MarathonHTTPClient(AbstractSchedulerClient):
    """Scheduler client that runs containers as Marathon apps.

    Apps are created with zero instances and started by scaling them to one.
    One-off commands are delegated to a FleetHTTPClient.
    """

    def __init__(self, target, auth, options, pkey):
        """Set up the Marathon client and the fleet client.

        The ``target`` argument is passed to the base class only; the
        Marathon host itself is read from ``settings.MARATHON_HOST``.
        """
        super(MarathonHTTPClient, self).__init__(target, auth, options, pkey)
        self.target = settings.MARATHON_HOST
        self.registry = settings.REGISTRY_HOST + ':' + settings.REGISTRY_PORT
        self.client = MarathonClient('http://' + self.target + ':8180')
        self.fleet = FleetHTTPClient('/var/run/fleet.sock', auth, options,
                                     pkey)

    # helpers
    def _app_id(self, name):
        """Return the Marathon app id for ``name`` (underscores become dots)."""
        return name.replace('_', '.')

    # container api
    def create(self, name, image, command='', **kwargs):
        """Create a new container

        Registers a Marathon app with zero instances whose command is a
        ``docker run`` of ``image`` from the configured registry, then polls
        up to POLL_ATTEMPTS times until the app reports no running tasks.

        :param name: job name; must match MATCH and provide a ``c_type`` group
        :param image: image name, prefixed with the registry address
        :param command: command appended to ``docker run``
        :param kwargs: optional ``memory`` and ``cpu`` dicts keyed by c_type
        """
        app_id = self._app_id(name)
        c_type = re.match(MATCH, name).groupdict()['c_type']
        image = self.registry + '/' + image

        # Memory limits look like "512m" or "512mb": drop the unit suffix.
        m = 0
        mems = kwargs.get('memory', {}).get(c_type)
        if mems:
            mems = mems.lower()
            if mems[-2:-1].isalpha() and mems[-1].isalpha():
                mems = mems[:-1]
            m = int(mems[:-1])

        # Fall back to half a CPU when no limit is configured for this type
        c = kwargs.get('cpu', {}).get(c_type) or 0.5

        cmd = "docker run --name {name} -P {image} {command}".format(
            name=name, image=image, command=command)
        self.client.create_app(
            app_id, MarathonApp(cmd=cmd, mem=m, cpus=c, instances=0))
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(app_id).tasks_running == 0:
                return
            time.sleep(1)

    def start(self, name):
        """Start a container.

        Scales the app to one instance, polls up to POLL_ATTEMPTS times for
        one running task, then waits for the container on the task's host.
        """
        app_id = self._app_id(name)
        self.client.scale_app(app_id, 1, force=True)
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(app_id).tasks_running == 1:
                break
            time.sleep(1)
        host = self.client.get_app(app_id).tasks[0].host
        self._waitforcontainer(host, name)

    def destroy(self, name):
        """Destroy a container.

        Deletes the Marathon app and removes the container from the host of
        its first task. If any of that fails, the app deletion is issued
        (again) on its own.
        """
        app_id = self._app_id(name)
        try:
            host = self.client.get_app(app_id).tasks[0].host
            self.client.delete_app(app_id, force=True)
            self._delete_container(host, name)
        except Exception:
            # Best effort: make sure the delete request has been sent
            self.client.delete_app(app_id, force=True)

    def _get_container_state(self, host, name):
        """Return the state of container ``name`` on ``host``.

        :return: JobState.up when docker reports it running,
            JobState.destroyed when the inspection fails, and None when the
            container exists but is not running
        """
        docker_cli = Client("tcp://{}:2375".format(host),
                            timeout=1200,
                            version='1.17')
        try:
            if docker_cli.inspect_container(name)['State']['Running']:
                return JobState.up
        except Exception:
            return JobState.destroyed

    def _waitforcontainer(self, host, name):
        """Poll once per second, POLL_WAIT times, until the container is up.

        :raises RuntimeError: when the container never reaches JobState.up
        """
        for _ in xrange(POLL_WAIT):
            if self._get_container_state(host, name) == JobState.up:
                return
            time.sleep(1)
        raise RuntimeError("App container Not Started")

    def _delete_container(self, host, name):
        """Force-remove container ``name`` on ``host`` if docker reports a state for it."""
        docker_cli = Client("tcp://{}:2375".format(host),
                            timeout=1200,
                            version='1.17')
        if docker_cli.inspect_container(name)['State']:
            docker_cli.remove_container(name, force=True)

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command."""
        return self.fleet.run(name, image, entrypoint, command)

    def state(self, name):
        """Display the given job's running state.

        :return: JobState.up for one running task, JobState.created for none,
            JobState.destroyed when querying Marathon fails, and None when
            POLL_ATTEMPTS polls never saw zero or one running task
        """
        app_id = self._app_id(name)
        try:
            for _ in xrange(POLL_ATTEMPTS):
                # Query once per poll so both comparisons see the same value
                running = self.client.get_app(app_id).tasks_running
                if running == 1:
                    return JobState.up
                if running == 0:
                    return JobState.created
                time.sleep(1)
        except Exception:
            return JobState.destroyed
class HealthCheckBencher(object):
    """Drive a Marathon health-check benchmark.

    Creates test apps whose names start with ``app_base_name``, fans HTTP
    actions out to their tasks over worker threads, and writes the collected
    health-check timestamps to a CSV file.
    """

    def __init__(self, marathon_url, image, tasks):
        """
        :param marathon_url: address passed to MarathonClient
        :param image: docker image used for every test app
        :param tasks: total number of tasks to run (int or numeric string)
        """
        self.concurrency = 20
        self.docker_image = image
        self.app_base_name = 'health-check-test-'
        self.total_tasks_cout = int(tasks)
        self.instances_per_app = 50
        # Compare the parsed integer: `tasks` may arrive as a string
        if self.total_tasks_cout < self.instances_per_app:
            self.instances_per_app = self.total_tasks_cout
            self.app_count = 1
        else:
            # Floor division keeps app_count an int usable with range()
            self.app_count = self.total_tasks_cout // self.instances_per_app
        self.heath_check_interval = 30
        self.test_duration = 20
        self.marathon_cluster = MarathonClient(marathon_url, timeout=240)
        self.work_queue = Queue()
        self.result_queue = Queue()
        self.app_list_queue = Queue()
        # Callables are run against every task; 'sleep=N' pauses N minutes
        self.action_list = [self.start_collect,
                            'sleep={}'.format(self.test_duration),
                            self.get_stats]

    def remove_apps(self):
        """Delete every test app and wait until Marathon no longer lists any."""
        prefix = "/" + self.app_base_name
        for app in self.marathon_cluster.list_apps():
            if app.id.startswith(prefix):
                self.marathon_cluster.delete_app(app.id)
        # Re-evaluate the listing from scratch on every poll, using the same
        # prefix as the deletion above.
        while any(app.id.startswith(prefix)
                  for app in self.marathon_cluster.list_apps()):
            sleep(1)

    def create_app(self, id):
        """Create one test app and queue its name on ``app_list_queue``.

        The app name is ``app_base_name`` plus an md5 hex digest of a random
        number; the ``id`` argument (the work item) is not used.
        """
        port_mapping = MarathonContainerPortMapping(container_port=80,
                                                    protocol="tcp")
        app_docker = MarathonDockerContainer(
            image=self.docker_image,
            network="BRIDGE",
            force_pull_image=True,
            port_mappings=[port_mapping])
        app_container = MarathonContainer(docker=app_docker)
        http_health_check = MarathonHealthCheck(
            protocol="HTTP",
            path="/status",
            grace_period_seconds=300,
            interval_seconds=self.heath_check_interval,
            timeout_seconds=20,
            max_consecutive_failures=0
        )

        app_suffix = str(md5(str(random())).hexdigest())
        app_name = self.app_base_name + app_suffix
        new_app = MarathonApp(cpus=CPUS, mem=MEM, disk=DISK,
                              container=app_container,
                              health_checks=[http_health_check],
                              instances=self.instances_per_app,
                              max_launch_delay_seconds=5)
        print("Creating {}".format(app_name))
        self.marathon_cluster.create_app(app_id=app_name, app=new_app)
        self.app_list_queue.put(app_name)
        return None

    def wait_instances(self, app_name):
        """Block until ``instances_per_app`` tasks of the app have health results."""
        health_ok = 0
        while health_ok < self.instances_per_app:
            tasks = self.marathon_cluster.list_tasks(app_name)
            health_ok = sum(1 for task in tasks if task.health_check_results)
            if health_ok < self.instances_per_app:
                # Pause between polls instead of hammering the API
                sleep(1)

    def start_collect(self, task):
        """Request ``/start_collect`` on the task and print the outcome."""
        url = 'http://'+task['host']+':'+str(task['port'])+'/start_collect'
        res = urlopen(url)
        if res.getcode() == 200:
            print(task['id']+': collecter was started')
        else:
            print(task['id']+': failed to start collecter')

    def stop_collect(self, task):
        """Request ``/stop_collect`` on the task and print the outcome."""
        url = 'http://'+task['host']+':'+str(task['port'])+'/stop_collect'
        res = urlopen(url)
        if res.getcode() == 200:
            print(task['id']+': collecter was stopped')
        else:
            print(task['id']+': failed to stop collecter')

    def clear_stats(self, task):
        """Request ``/clear_stats`` on the task and print the outcome."""
        url = 'http://'+task['host']+':'+str(task['port'])+'/clear_stats'
        res = urlopen(url)
        if res.getcode() == 200:
            print(task['id']+': stats was dropped')
        else:
            print(task['id']+': failed to drop stats')

    def get_stats(self, task):
        """Fetch ``/get_timestamps`` from the task and queue the result.

        Every call puts one dict with ``id``, ``status`` and ``data`` on
        ``result_queue``; ``data`` holds the comma-separated response body on
        HTTP 200 and is empty otherwise.
        """
        url = 'http://'+task['host']+':'+str(task['port'])+'/get_timestamps'
        try:
            res = urlopen(url)
        except Exception:
            print("URL req failed")
            self.result_queue.put({'id': task['id'],
                                   'status': 'Failed',
                                   'data': []})
            return
        code = res.getcode()
        if code == 200:
            data = res.read()
            timestamps = data.split(',')
            self.result_queue.put({'id': task['id'],
                                   'status': 'ok',
                                   'data': timestamps})
        elif code == 202:
            print("Collecting is not enabled")
            self.result_queue.put({'id': task['id'],
                                   'status': 'Collecting is not enabled',
                                   'data': []})
        else:
            print("Unknown response code")
            self.result_queue.put({'id': task['id'],
                                   'status': 'Unknown response code',
                                   'data': []})

    def repeat(self, action):
        """Worker loop: apply ``action`` to work items until the queue is empty."""
        while self.work_queue.empty() is False:
            try:
                iteration = self.work_queue.get_nowait()
            except Empty:
                continue
            try:
                action(iteration)
            finally:
                # Always acknowledge the item, otherwise a failing action
                # leaves work_queue.join() waiting forever.
                self.work_queue.task_done()

    def fill_queue(self, iterations):
        """Put every element of ``iterations`` on ``work_queue``."""
        for iteration in iterations:
            self.work_queue.put(iteration)

    def get_tasks(self):
        """Return id/host/port dicts (all strings) for the test apps' tasks."""
        return [{'id': str(task.id),
                 'host': str(task.host),
                 'port': str(task.ports[0])}
                for task in self.marathon_cluster.list_tasks()
                if task.id.startswith(self.app_base_name)]

    def create_apps(self):
        """Create ``app_count`` apps in parallel, then wait for their instances."""
        self.fill_queue(range(self.app_count))
        for thread_num in range(self.concurrency):
            if self.work_queue.empty() is True:
                break
            worker = Thread(target=self.repeat, args=(self.create_app,))
            worker.start()
        self.work_queue.join()

        # Hand the names of the created apps over as the next batch of work
        while self.app_list_queue.empty() is False:
            try:
                app_name = self.app_list_queue.get_nowait()
            except Empty:
                continue
            self.work_queue.put(app_name)

        for thread_num in range(self.concurrency):
            if self.work_queue.empty() is True:
                break
            worker = Thread(target=self.repeat, args=(self.wait_instances,))
            worker.start()
        self.work_queue.join()

    def start_test(self):
        """Run every entry of ``action_list`` in order.

        A string entry of the form ``sleep=N`` pauses for N minutes; any
        other string is skipped. A callable entry is applied to every task
        using up to ``concurrency`` worker threads.
        """
        task_list = self.get_tasks()
        for action in self.action_list:
            if isinstance(action, basestring):
                if action.startswith('sleep='):
                    amount = int(action.split('=')[1])
                    sleep(60*amount)
                continue
            self.fill_queue(task_list)
            for thread_num in range(self.concurrency):
                if self.work_queue.empty() is True:
                    break
                worker = Thread(target=self.repeat, args=(action,))
                worker.start()
            self.work_queue.join()

    def generate_report(self):
        """Drain ``result_queue`` into a timestamped CSV file in the working directory."""
        today = datetime.today()
        file_prefix = "{:%Y-%m-%d_%H_%M_%S-}".format(today)
        file_name = (file_prefix +
                     'health_check_result-' +
                     str(self.total_tasks_cout) +
                     'tasks.csv')

        # The context manager closes the file even if a write fails
        with open(file_name, "w") as f:
            f.write("Task ID,Health check timestamp")

            while self.result_queue.empty() is False:
                try:
                    result = self.result_queue.get_nowait()
                except Empty:
                    continue
                for timestamp in result['data']:
                    f.write("\n%s,%s" % (result['id'], timestamp))
Example #20
0
 def create_app_from_json(self, json_data):
     """Build a MarathonApp from ``json_data`` and create it under its own id.

     Returns whatever ``MarathonClient.create_app`` returns.
     """
     app = MarathonApp.from_json(json_data)
     return MarathonClient.create_app(self, app.id, app)
Example #21
0
class MarathonIF(object):
    """Convenience wrapper around a MarathonClient with retry handling.

    ``get_app``, ``delete_app`` and ``scale_app`` retry every 10 seconds
    until a timeout expires; the remaining methods forward to the client in
    ``self.mcli`` or poll on top of it.
    """

    def __init__(self, marathon_addr, my_addr, mesos):
        """Create the MarathonClient for ``marathon_addr`` and store the rest.

        :param marathon_addr: passed to MarathonClient
        :param my_addr: kept as ``self.myAddr``
        :param mesos: kept as ``self.mesos``
        """
        self.mcli = MarathonClient(marathon_addr)
        self.myAddr = my_addr
        self.mesos = mesos

    def get_apps(self):
        """Return the result of ``self.mcli.list_apps()``."""
        return self.mcli.list_apps()

    def get_app(self, app_id, timeout=300):
        """Fetch an app, retrying on errors until ``timeout`` seconds passed.

        :return: the app, or None when Marathon raises NotFoundError
        :raises Exception: when no attempt succeeded within the timeout
        """
        st_time = time.time()
        while time.time() - st_time < timeout:
            try:
                return self.mcli.get_app(app_id)
            except marathon.exceptions.NotFoundError:
                return None
            except Exception:
                l.info("mcli: get_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception("mcli get_app timed out, possible zookeper/marathon/mesos malfunction")

    def delete_app(self, app_id, force=False, timeout=200):
        """Delete an app, retrying on errors until ``timeout`` seconds passed.

        :return: None
        :raises Exception: when no attempt succeeded within the timeout
        """
        st_time = time.time()
        while time.time() - st_time < timeout:
            try:
                self.mcli.delete_app(app_id, force)
                return
            except Exception:
                l.info("mcli: delete_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception("mcli delete_app timed out, possible zookeper/marathon/mesos malfunction")

    def delete_deployment(self, dep_id):
        """Forward to ``self.mcli.delete_deployment(dep_id)``."""
        return self.mcli.delete_deployment(dep_id)

    def get_deployments(self):
        """Return the result of ``self.mcli.list_deployments()``."""
        return self.mcli.list_deployments()

    def delete_app_ifexisting(self, app_id, trys=4):
        """Delete ``app_id`` when it exists, retrying on errors.

        :param app_id: application ID
        :param trys: maximum number of attempts; failed attempts are
            separated by a 10 second pause
        :return: the delete_app() result, or None when the app does not exist
        :raises Exception: the error of the last attempt once all have failed
        :raises RuntimeError: when ``trys`` allows no attempt at all
        """
        last_error = None
        for idx in range(0, trys):
            try:
                if self.get_app(app_id):
                    return self.delete_app(app_id)
                return None
            except Exception as err:
                # Remember the error so it can be re-raised after the loop;
                # a bare `raise` there has no active exception to re-raise.
                last_error = err
                pprint("<p>Error: %s</p>" % sys.exc_info()[0])
                # No point in waiting when there is no further attempt
                if idx < trys - 1:
                    time.sleep(10)
        if last_error is None:
            raise RuntimeError(
                "delete_app_ifexisting: no attempt was made (trys=%r)" % (trys,))
        raise last_error

    @staticmethod
    def is_valid_app_id(app_id):
        """Return True when ``app_id`` only contains letters, digits, '-', '/' and '.'.

        The empty string is accepted as well.
        """
        # NOTE(review): the pattern accepts uppercase letters although the
        # error message in create_app() speaks of lowercase only - confirm
        # which of the two is intended.
        return bool(re.match("^[A-Za-z0-9-/.]*$", app_id))

    def create_app(self, app_id, attr):
        """
            Create and start an app.

            The request is attempted up to 10 times. An attempt is repeated,
            after a one second pause, only when Marathon answers that the app
            is locked by a deployment; any other HTTP error propagates at once.

            :param app_id: (str) - Application ID
            :param attr: marathon.models.app.MarathonApp application to create.
            :return: the created app
            :raises Exception: when app_id is rejected by is_valid_app_id().
            :raises marathon.exceptions.MarathonHttpError: on a non-"locked"
                error, or the last "locked" error once every attempt failed.
        """
        # Validate that app_id conforms to allowed naming scheme.
        if not self.is_valid_app_id(app_id):
            l.error("Error: Only lowercase letters, digits, hyphens are allowed in app_id. %s" % app_id)
            raise Exception("Invalid app_id")

        max_attempts = 10
        last_error = None
        for idx in range(0, max_attempts):
            try:
                return self.mcli.create_app(app_id, attr)
            except marathon.exceptions.MarathonHttpError as e:
                if str(e).find('App is locked by one or more deployments. Override with the option') < 0:
                    raise
                # Python 3 unbinds `e` when the handler exits
                last_error = e
            if idx < max_attempts - 1:
                time.sleep(1)
        raise last_error

    def wait_app_removal(self, app):
        """Poll every 0.2 s until get_app() stops returning ``app``.

        Logs the poll count after each unsuccessful check. Always returns True.
        """
        cnt = 0
        while self.get_app(app):
            time.sleep(0.2)
            cnt += 1
            l.info("Stuck waiting for %s to be deleted CNT=%d" % (app, cnt))
        return True

    def wait_app_ready(self, app, running_count, sleep_before_next_try=1):
        """Wait until ``app`` has exactly ``running_count`` running tasks.

        When more tasks than requested are running, the app is scaled down
        to ``running_count`` before polling continues.

        :param sleep_before_next_try: seconds between polls while too few
            tasks are running
        :return: the app object from the poll that matched
        """
        cnt = 0
        while True:
            a1 = self.get_app(app)
            # if tasks_running are greater (due to whatever reason, scale down accordingly)
            if a1.tasks_running > running_count:
                delta = a1.tasks_running - running_count
                l.info("Found [%d] more apps, scaling down to [%d]", delta, running_count)
                self.scale_app(app, running_count)
                # Allow for some time before next poll
                time.sleep(1)
                continue
            if a1.tasks_running == running_count:
                return a1
            cnt += 1
            time.sleep(sleep_before_next_try)
            # Report progress on poll 29, 59, 89, ...
            if (cnt % 30) == 29:
                l.info("[%d]Waiting for task to move to running stage, " % cnt +
                       "current stat staged=%d running=%d expected Running=%d" %
                       (a1.tasks_staged, a1.tasks_running, running_count))

    def scale_app(self, app, scale, timeout=300):
        """Scale an app, retrying on errors until ``timeout`` seconds passed.

        :return: None
        :raises Exception: when no attempt succeeded within the timeout
        """
        st_time = time.time()
        while time.time() - st_time < timeout:
            try:
                self.mcli.scale_app(app, scale)
                return
            except Exception:
                l.info("mcli: scale_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception("mcli scale_app timed out, possible zookeper/marathon/mesos malfunction")

    def ping(self):
        """Forward to ``self.mcli.ping()``."""
        return self.mcli.ping()

    def kill_task(self, app_id, task_id):
        """Forward to ``self.mcli.kill_task(app_id, task_id)``."""
        return self.mcli.kill_task(app_id, task_id)
class MarathonSpawner(Spawner):
    """JupyterHub spawner that runs each single-user server as a Marathon app.

    The app is a docker container named ``/<app_prefix>/<username>`` with a
    single instance; blocking Marathon calls run on a shared one-thread
    executor.
    """

    # Same .tag(config=True) form as every other trait of this class
    app_image = Unicode(
        "jupyterhub/singleuser:%s" % _jupyterhub_xy).tag(config=True)

    app_prefix = Unicode("jupyter",
                         help=dedent("""
            Prefix for app names. The full app name for a particular
            user will be <prefix>/<username>.
            """)).tag(config=True)

    marathon_host = Unicode(
        u'', help="Hostname of Marathon server").tag(config=True)

    marathon_constraints = List(
        [],
        help='Constraints to be passed through to Marathon').tag(config=True)

    ports = List([8888], help='Ports to expose externally').tag(config=True)

    volumes = List([],
                   help=dedent("""
            A list in Marathon REST API format for mounting volumes into the docker container.
            [
                {
                    "containerPath": "/foo",
                    "hostPath": "/bar",
                    "mode": "RW"
                }
            ]

            Note that using the template variable {username} in containerPath,
            hostPath or the name variable in case it's an external drive
            it will be replaced with the current user's name.
            """)).tag(config=True)

    network_mode = Unicode('BRIDGE',
                           help="Enum of BRIDGE or HOST").tag(config=True)

    hub_ip_connect = Unicode(
        "", help="Public IP address of the hub").tag(config=True)

    @observe('hub_ip_connect')
    def _ip_connect_changed(self, change):
        """Warn that hub_ip_connect is obsolete on JupyterHub 0.8 and later."""
        if jupyterhub.version_info >= (0, 8):
            warnings.warn(
                "MarathonSpawner.hub_ip_connect is no longer needed with JupyterHub 0.8."
                "  Use JupyterHub.hub_connect_ip instead.",
                DeprecationWarning,
            )

    hub_port_connect = Integer(-1,
                               help="Public PORT of the hub").tag(config=True)

    @observe('hub_port_connect')
    def _port_connect_changed(self, change):
        """Warn that hub_port_connect is obsolete on JupyterHub 0.8 and later."""
        if jupyterhub.version_info >= (0, 8):
            warnings.warn(
                "MarathonSpawner.hub_port_connect is no longer needed with JupyterHub 0.8."
                "  Use JupyterHub.hub_connect_port instead.",
                DeprecationWarning,
            )

    format_volume_name = Any(
        help="""Any callable that accepts a string template and a Spawner
        instance as parameters in that order and returns a string.
        """).tag(config=True)

    @default('format_volume_name')
    def _get_default_format_volume_name(self):
        return default_format_volume_name

    # fix default port to 8888, used in the container
    @default('port')
    def _port_default(self):
        return 8888

    # default to listening on all-interfaces in the container
    @default('ip')
    def _ip_default(self):
        return '0.0.0.0'

    # Created lazily by `executor` and stored on the class
    _executor = None

    @property
    def executor(self):
        """Return the class-wide single-thread executor, creating it on first use."""
        cls = self.__class__
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(1)
        return cls._executor

    def __init__(self, *args, **kwargs):
        """Initialise the spawner and create the client for ``marathon_host``."""
        super().__init__(*args, **kwargs)
        self.marathon = MarathonClient(self.marathon_host)

    @property
    def container_name(self):
        """Marathon app id of this user's server: ``/<app_prefix>/<username>``."""
        return '/%s/%s' % (self.app_prefix, self.user.name)

    def get_state(self):
        """Return the parent's state dict extended with ``container_name``."""
        state = super().get_state()
        state['container_name'] = self.container_name
        return state

    def load_state(self, state):
        """Restore the parent's state.

        ``container_name`` is a read-only property computed from
        ``app_prefix`` and the user name, so the saved value needs no
        restoring here.
        """
        super().load_state(state)

    def get_health_checks(self):
        """Return the health checks of the app: one TCP check on port index 0."""
        return [
            MarathonHealthCheck(protocol='TCP',
                                port_index=0,
                                grace_period_seconds=300,
                                interval_seconds=30,
                                timeout_seconds=20,
                                max_consecutive_failures=0)
        ]

    def get_volumes(self):
        """Build the container volumes from the ``volumes`` setting.

        Container path, host path and, when present, the external volume
        name are passed through ``format_volume_name``.
        """
        volumes = []
        for v in self.volumes:
            mv = MarathonContainerVolume.from_json(v)
            mv.container_path = self.format_volume_name(
                mv.container_path, self)
            mv.host_path = self.format_volume_name(mv.host_path, self)
            if mv.external and 'name' in mv.external:
                mv.external['name'] = self.format_volume_name(
                    mv.external['name'], self)
            volumes.append(mv)
        return volumes

    def get_port_mappings(self):
        """Return one tcp port mapping with host port 0 per entry of ``ports``."""
        return [
            MarathonContainerPortMapping(container_port=p,
                                         host_port=0,
                                         protocol='tcp')
            for p in self.ports
        ]

    def get_constraints(self):
        """Return the ``marathon_constraints`` entries as MarathonConstraint objects."""
        return [
            MarathonConstraint.from_json(c) for c in self.marathon_constraints
        ]

    @run_on_executor
    def get_deployment(self, deployment_id):
        """Return the listed deployment with id ``deployment_id``, or None."""
        for d in self.marathon.list_deployments():
            if d.id == deployment_id:
                return d
        return None

    @run_on_executor
    def get_deployment_for_app(self, app_name):
        """Return the first listed deployment affecting ``app_name``, or None."""
        for d in self.marathon.list_deployments():
            if app_name in d.affected_apps:
                return d
        return None

    def get_ip_and_port(self, app_info):
        """Return ``(ip, port)`` of the app's only task.

        The ip is the resolved address of the task's host, the port is the
        first port of the task. Asserts that there is exactly one task.
        """
        assert len(app_info.tasks) == 1
        task = app_info.tasks[0]
        ip = socket.gethostbyname(task.host)
        return (ip, task.ports[0])

    @run_on_executor
    def get_app_info(self, app_name):
        """Return the app with its tasks embedded, or None on NotFoundError."""
        try:
            app = self.marathon.get_app(app_name, embed_tasks=True)
        except NotFoundError:
            self.log.info("The %s application has not been started yet",
                          app_name)
            return None
        else:
            return app

    def _public_hub_api_url(self):
        """Return the hub API url with host/port replaced by the connect overrides.

        ``hub_ip_connect`` and ``hub_port_connect`` are used when set (non-empty
        / greater than 0); otherwise the values of ``hub.api_url`` are kept.
        """
        uri = urlparse(self.hub.api_url)
        port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port
        ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname
        return urlunparse((uri.scheme, '%s:%s' % (ip, port), uri.path,
                           uri.params, uri.query, uri.fragment))

    def get_args(self):
        """Return the parent's arguments, overriding --hub-api-url if needed.

        When ``hub_ip_connect`` is set, the first existing ``--hub-api-url=``
        argument is dropped and one built from _public_hub_api_url() is added.
        """
        args = super().get_args()
        if self.hub_ip_connect:
            # JupyterHub 0.7 specifies --hub-api-url
            # on the command-line, which is hard to update
            for idx, arg in enumerate(list(args)):
                if arg.startswith('--hub-api-url='):
                    args.pop(idx)
                    break
            args.append('--hub-api-url=%s' % self._public_hub_api_url())
        return args

    @gen.coroutine
    def start(self):
        """Create the Marathon app and wait until one task is healthy.

        :return: ``(ip, port)`` of the task, or None when the create request
            returned False or an app without deployments
        """
        docker_container = MarathonDockerContainer(
            image=self.app_image,
            network=self.network_mode,
            port_mappings=self.get_port_mappings())

        app_container = MarathonContainer(docker=docker_container,
                                          type='DOCKER',
                                          volumes=self.get_volumes())

        # the memory request in marathon is in MiB
        if hasattr(self, 'mem_limit') and self.mem_limit is not None:
            mem_request = self.mem_limit / 1024.0 / 1024.0
        else:
            mem_request = 1024.0

        cmd = self.cmd + self.get_args()
        app_request = MarathonApp(id=self.container_name,
                                  cmd=' '.join(cmd),
                                  env=self.get_env(),
                                  cpus=self.cpu_limit,
                                  mem=mem_request,
                                  container=app_container,
                                  constraints=self.get_constraints(),
                                  health_checks=self.get_health_checks(),
                                  instances=1,
                                  accepted_resource_roles=['*'])

        self.log.info("Creating App: %s", app_request)
        self.log.info("self.marathon: %s", self.marathon)
        app = self.marathon.create_app(self.container_name, app_request)
        if app is False or app.deployments is None:
            self.log.error("Failed to create application for %s",
                           self.container_name)
            self.log.error("app: %s", app)
            return None

        # Poll once per second; this loop has no timeout of its own
        while True:
            app_info = yield self.get_app_info(self.container_name)
            if app_info and app_info.tasks_healthy == 1:
                ip, port = self.get_ip_and_port(app_info)
                break
            yield gen.sleep(1)
        return (ip, port)

    @gen.coroutine
    def stop(self, now=False):
        """Delete the Marathon app.

        Unless ``now`` is true, wait until the deployment named in the delete
        response is no longer listed. A failing delete request is logged and
        re-raised.
        """
        try:
            status = self.marathon.delete_app(self.container_name)
        except Exception:
            self.log.error("Could not delete application %s",
                           self.container_name)
            raise
        else:
            if not now:
                while True:
                    deployment = yield self.get_deployment(
                        status['deploymentId'])
                    if deployment is None:
                        break
                    yield gen.sleep(1)

    @gen.coroutine
    def poll(self):
        """Report the status of the user's app.

        :return: 1 when a deployment affecting the app contains a
            ``StopApplication`` action; None while any other deployment
            affects it or when exactly one task is healthy; 0 otherwise
        """
        deployment = yield self.get_deployment_for_app(self.container_name)
        if deployment:
            for current_action in deployment.current_actions:
                if current_action.action == 'StopApplication':
                    self.log.error("Application %s is shutting down",
                                   self.container_name)
                    return 1
            return None

        app_info = yield self.get_app_info(self.container_name)
        if app_info and app_info.tasks_healthy == 1:
            return None
        return 0
class HealthCheckBencher(object):
    """Drive a Marathon health-check benchmark against generated test apps.

    The bencher creates ``health-check-test-*`` apps, waits for their tasks
    to report health-check results, runs the configured actions against
    every task over HTTP and can dump the collected timestamps to a CSV file.
    """

    # Upper bound on the number of instances placed in one Marathon app.
    MAX_INSTANCES_PER_APP = 50

    def __init__(self, marathon_url, image, tasks):
        """Set up queues, the Marathon client and the test plan.

        Args:
            marathon_url: Marathon endpoint passed to ``MarathonClient``.
            image: Docker image used for every test app.
            tasks: Total number of tasks to run (int or numeric string).
        """
        self.concurrency = 20
        self.docker_image = image
        self.app_base_name = 'health-check-test-'
        self.total_tasks_cout = int(tasks)
        self.instances_per_app, self.app_count = self._split_tasks(
            self.total_tasks_cout, self.MAX_INSTANCES_PER_APP)
        self.heath_check_interval = 30
        self.test_duration = 20
        self.marathon_cluster = MarathonClient(marathon_url, timeout=240)
        self.work_queue = Queue()
        self.result_queue = Queue()
        self.app_list_queue = Queue()
        # Callables run once per task; 'sleep=N' strings pause N minutes.
        self.action_list = [self.start_collect,
                            'sleep={}'.format(self.test_duration),
                            self.get_stats]

    @staticmethod
    def _split_tasks(total_tasks, max_per_app):
        """Return ``(instances_per_app, app_count)`` for ``total_tasks``.

        Fewer tasks than ``max_per_app`` fit into one app; otherwise apps
        hold ``max_per_app`` instances each and the count is floored, so a
        remainder smaller than ``max_per_app`` is not scheduled.
        """
        if total_tasks < max_per_app:
            return total_tasks, 1
        # Floor division keeps the count an int on Python 3 (range() needs it).
        return max_per_app, total_tasks // max_per_app

    def remove_apps(self):
        """Delete every benchmark app and wait until Marathon lists none."""
        # Marathon app ids carry a leading slash.
        prefix = "/" + self.app_base_name
        for app in self.marathon_cluster.list_apps():
            if app.id.startswith(prefix):
                self.marathon_cluster.delete_app(app.id)
        # Re-evaluate the listing from scratch on every pass.
        while any(app.id.startswith(prefix)
                  for app in self.marathon_cluster.list_apps()):
            sleep(1)

    def create_app(self, id):
        """Create one benchmark app with a random name and queue that name.

        Args:
            id: Work-queue item supplied by ``repeat``; not used for naming.
        """
        port_mapping = MarathonContainerPortMapping(container_port=80,
                                                    protocol="tcp")
        app_docker = MarathonDockerContainer(
            image=self.docker_image,
            network="BRIDGE",
            force_pull_image=True,
            port_mappings=[port_mapping])
        app_container = MarathonContainer(docker=app_docker)
        http_health_check = MarathonHealthCheck(
            protocol="HTTP",
            path="/status",
            grace_period_seconds=300,
            interval_seconds=self.heath_check_interval,
            timeout_seconds=20,
            max_consecutive_failures=0
        )

        # md5 needs bytes on Python 3; encoding also works on Python 2.
        app_suffix = md5(str(random()).encode('utf-8')).hexdigest()
        app_name = self.app_base_name + app_suffix
        new_app = MarathonApp(cpus=CPUS, mem=MEM, disk=DISK,
                              container=app_container,
                              health_checks=[http_health_check],
                              instances=self.instances_per_app,
                              max_launch_delay_seconds=5)
        print("Creating {}".format(app_name))
        self.marathon_cluster.create_app(app_id=app_name, app=new_app)
        self.app_list_queue.put(app_name)
        return None

    def wait_instances(self, app_name):
        """Block until enough tasks of ``app_name`` have health results."""
        health_ok = 0
        while health_ok < self.instances_per_app:
            tasks = self.marathon_cluster.list_tasks(app_name)
            health_ok = sum(1 for task in tasks if task.health_check_results)
            if health_ok < self.instances_per_app:
                # Pause between polls instead of hammering the Marathon API.
                sleep(1)

    def _task_url(self, task, endpoint):
        """Build the HTTP URL of ``endpoint`` on the given task."""
        return ('http://' + task['host'] + ':' + str(task['port']) +
                '/' + endpoint)

    def _notify(self, task, endpoint, ok_message, fail_message):
        """GET ``endpoint`` on the task and print the matching message."""
        res = urlopen(self._task_url(task, endpoint))
        if res.getcode() == 200:
            print(task['id'] + ok_message)
        else:
            print(task['id'] + fail_message)

    def start_collect(self, task):
        """Ask the task to start collecting health-check timestamps."""
        self._notify(task, 'start_collect',
                     ': collecter was started',
                     ': failed to start collecter')

    def stop_collect(self, task):
        """Ask the task to stop collecting health-check timestamps."""
        self._notify(task, 'stop_collect',
                     ': collecter was stopped',
                     ': failed to stop collecter')

    def clear_stats(self, task):
        """Ask the task to drop its collected statistics."""
        self._notify(task, 'clear_stats',
                     ': stats was dropped',
                     ': failed to drop stats')

    def _put_result(self, task, status, data):
        """Queue one result record for ``generate_report``."""
        self.result_queue.put({'id': task['id'],
                               'status': status,
                               'data': data})

    def get_stats(self, task):
        """Fetch the task's timestamps and queue them as a result record."""
        try:
            res = urlopen(self._task_url(task, 'get_timestamps'))
        except Exception:
            print("URL req failed")
            self._put_result(task, 'Failed', [])
            return
        code = res.getcode()
        if code == 200:
            data = res.read()
            if isinstance(data, bytes):
                # urlopen returns bytes; decode before splitting on a str.
                data = data.decode('utf-8')
            self._put_result(task, 'ok', data.split(','))
        elif code == 202:
            print("Collecting is not enabled")
            self._put_result(task, 'Collecting is not enabled', [])
        else:
            print("Unknown response code")
            self._put_result(task, 'Unknown response code', [])

    def repeat(self, action):
        """Worker loop: apply ``action`` to items until the queue is empty.

        A failing item is reported and skipped, and ``task_done`` is always
        called, so ``work_queue.join()`` cannot hang on a raised exception.
        """
        while not self.work_queue.empty():
            try:
                iteration = self.work_queue.get_nowait()
            except Empty:
                # Another worker took the last item between the two calls.
                continue
            try:
                action(iteration)
            except Exception as exc:
                print("Action failed for {!r}: {}".format(iteration, exc))
            finally:
                self.work_queue.task_done()

    def fill_queue(self, iterations):
        """Put every item of ``iterations`` on the work queue."""
        for iteration in iterations:
            self.work_queue.put(iteration)

    def get_tasks(self):
        """Return id/host/first-port dicts for all benchmark tasks."""
        res = []
        for task in self.marathon_cluster.list_tasks():
            if not task.id.startswith(self.app_base_name):
                continue
            res.append({'id': str(task.id),
                        'host': str(task.host),
                        'port': str(task.ports[0])})
        return res

    def _run_workers(self, action):
        """Start worker threads for ``action`` and wait for the queue."""
        for _ in range(self.concurrency):
            if self.work_queue.empty():
                break
            worker = Thread(target=self.repeat, args=(action,))
            worker.start()
        self.work_queue.join()

    def create_apps(self):
        """Create all benchmark apps, then wait for their instances."""
        self.fill_queue(range(self.app_count))
        self._run_workers(self.create_app)

        # Move the created app names over to the work queue.
        while not self.app_list_queue.empty():
            try:
                app_name = self.app_list_queue.get_nowait()
            except Empty:
                continue
            self.work_queue.put(app_name)

        self._run_workers(self.wait_instances)

    def start_test(self):
        """Run every entry of ``action_list`` against the current tasks."""
        task_list = self.get_tasks()
        for action in self.action_list:
            # string_types also matches native str on Python 2.
            if isinstance(action, six.string_types):
                if action.startswith('sleep='):
                    amount = int(action.split('=')[1])
                    sleep(60*amount)  # the directive is given in minutes
                continue
            self.fill_queue(task_list)
            self._run_workers(action)

    def generate_report(self):
        """Write all queued results to a timestamped CSV in the cwd."""
        today = datetime.today()
        file_prefix = "{:%Y-%m-%d_%H_%M_%S-}".format(today)
        file_name = (file_prefix +
                     'health_check_result-' +
                     str(self.total_tasks_cout) +
                     'tasks.csv')

        with open(file_name, "w") as f:
            f.write("Task ID,Health check timestamp")
            while not self.result_queue.empty():
                try:
                    result = self.result_queue.get_nowait()
                except Empty:
                    continue
                for timestamp in result['data']:
                    f.write("\n%s,%s" % (result['id'], timestamp))
Example #24
0
    # Build the Marathon client; abort the run if that fails.
    try:
        logging.info("Connecting to Marathon...")
        client = MarathonClient(marathon_urls,
                                username=marathon_user,
                                password=marathon_password,
                                verify=False)
    except MarathonError as e:
        logging.error("Failed to connect to Marathon! {}".format(e))
        exit_code = 1
        sys.exit(exit_code)

    logging.info("Deploying application...")
    try:
        app = client.get_app(marathon_app_id)
    except MarathonHttpError:
        # The lookup failed, so the app is created from scratch. create_app
        # returns an object, hence attribute access here.
        response = client.create_app(marathon_app_id, app_definition)
        version = response.version
        # Same variable name as the update branch, so both paths define it.
        deployment_id = response.deployments[0].id
    else:
        # The app already exists: update it in place. update_app returns a
        # mapping, hence key access here.
        response = client.update_app(marathon_app_id,
                                     app_definition,
                                     force=marathon_force)
        version = response['version']
        deployment_id = response['deploymentId']

    logging.info("New version deployed: {}".format(version))

    if app_definition.instances == 0:
        logging.info(
            "Deactivated application by setting instances to 0, deployment complete."
        )
import time
from optparse import OptionParser
from marathon import MarathonClient
from marathon.models import MarathonApp

if __name__ == '__main__':
    # This script takes no positional arguments; show help and exit with
    # status 2 if any are given.
    usage = 'python %prog'
    parser = OptionParser(
        description='Simple marathon-python based master to launch apps',
        version="0.1 ",
        usage=usage)
    options, args = parser.parse_args()
    if args:
        parser.print_help()
        sys.exit(2)

    print("Initiating marathonclient...")
    client = MarathonClient('http://localhost:8080')
    app_cmd = "python /home/abdullah/cosmic-space/test-mesos/py-zmq/sub_client.py --server_ip_ports 10.10.0.2:5556"

    # Launch the app with a minimal resource footprint.
    print("Initiating zmq-client app")
    zmq_app = MarathonApp(cmd=app_cmd, mem=16, cpus=0.01)
    client.create_app('zmq-client', zmq_app)

    # Wait for the operator, then scale to 400 instances.
    raw_input("scale_apps upto 400")
    client.scale_app('zmq-client', instances=400)

    # Wait for the operator again, then delete the app.
    raw_input("delete apps")
    client.delete_app('zmq-client')
class MarathonSpawner(Spawner):
    """JupyterHub spawner that runs each single-user server as a Marathon app.

    Defaults come from the configurable traits below. When ``zeta_user_file``
    is set, per-user overrides are read from that file at the start of every
    spawn (see ``update_users``).
    """

    # Load the app image
    app_image = Unicode("jupyterhub/singleuser", config=True)

    # The command to run
    app_cmd = Unicode("jupyter notebook", config=True)

    # This is the prefix in Marathon
    app_prefix = Unicode(
        "jupyter",
        help=dedent(
            """
            Prefix for app names. The full app name for a particular
            user will be <prefix>/<username>.
            """
        )
    ).tag(config=True)

    user_web_port = Integer(0, help="Port that the Notebook is listening on").tag(config=True)
    user_ssh_port = Integer(0, help="SSH Port that the container is listening on").tag(config=True)
    user_ssh_host = Unicode('', help="Hostname of the ssh container").tag(config=True)

    use_jupyterlab = Integer(0, help="Use Jupyterlab - Jupyterlab is 1 default is 0 or Jupyternotebook").tag(config=True)

    user_ssh_hagroup = Unicode('', help="HAProxy group for ssh container port").tag(config=True)

    # zeta_user_file holds the users and their custom settings for an
    # installation in Zeta Architecture. If it is blank, the JupyterHub
    # defaults are used for memory, CPU, ports and image; otherwise the
    # per-user values are read from that file.
    zeta_user_file = Unicode(
        "",
        help="Path to json file that includes users and per user settings"
    ).tag(config=True)

    no_user_file_fail = Bool(
        True,
        help="Is zeta_user_file is provided, but can't be opened fail. (Default). False loads defaults and tries to spawn"
    ).tag(config=True)

    # Marathon Server
    marathon_host = Unicode(
        u'',
        help="Hostname of Marathon server").tag(config=True)

    marathon_user_name = Unicode(
        u'',
        help='Marathon user name'
    ).tag(config=True)

    marathon_user_password = Unicode(
        u'',
        help='Marathon user password'
    ).tag(config=True)

    fetch = List([], help='Optional files to fetch').tag(config=True)

    custom_env = List(
        [],
        help='Additional ENVs to add to the default. Format is a list of 1 record dictionary. [{key:val}]'
    ).tag(config=True)

    # Constraints in Marathon
    marathon_constraints = List(
        [],
        help='Constraints to be passed through to Marathon').tag(config=True)

    # Shared Notebook location
    shared_notebook_dir = Unicode(
        '', help="Shared Notebook location that users will get a link to in their notebook location - can be blank"
    ).tag(config=True)

    ports = List(
        [8888],
        help='Ports to expose externally'
    ).tag(config=True)

    volumes = List(
        [],
        help=dedent(
            """
            A list in Marathon REST API format for mounting volumes into the docker container.
            [
                {
                    "containerPath": "/foo",
                    "hostPath": "/bar",
                    "mode": "RW"
                }
            ]

            Note that using the template variable {username} in containerPath,
            hostPath or the name variable in case it's an external drive
            it will be replaced with the current user's name.
            """
        )
    ).tag(config=True)

    network_mode = Unicode(
        'BRIDGE',
        help="Enum of BRIDGE or HOST"
    ).tag(config=True)

    hub_ip_connect = Unicode(
        "",
        help="Public IP address of the hub"
    ).tag(config=True)

    hub_port_connect = Integer(
        -1,
        help="Public PORT of the hub"
    ).tag(config=True)

    format_volume_name = Any(
        help="""Any callable that accepts a string template and a Spawner
        instance as parameters in that order and returns a string.
        """
    ).tag(config=True)

    @default('format_volume_name')
    def _get_default_format_volume_name(self):
        """Return the module-level default volume-name formatter."""
        return default_format_volume_name

    # One single-thread executor shared by all instances of the class; it is
    # created lazily on first access.
    _executor = None

    @property
    def executor(self):
        """Executor used by the ``@run_on_executor`` methods."""
        cls = self.__class__
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(1)
        return cls._executor

    def __init__(self, *args, **kwargs):
        """Initialise the spawner and build its Marathon client."""
        super(MarathonSpawner, self).__init__(*args, **kwargs)
        self.marathon = MarathonClient(self.marathon_host,
                                       self.marathon_user_name,
                                       self.marathon_user_password)

    @property
    def container_name(self):
        """Marathon app id: ``/<app_prefix>/<user name><server name>``."""
        self.log.info("Container Name : %s / %s / %s",self.app_prefix, self.user.name, self.name )
        try:
            self.log.info("Debug %s", json.dumps(self.name))
        except Exception:
            self.log.info("Could not log self")
        return '/%s/%s%s' % (self.app_prefix, self.user.name, self.name)

    def get_state(self):
        """Return the base spawner state plus the Marathon app id."""
        state = super(MarathonSpawner, self).get_state()
        state['container_name'] = self.container_name
        return state

    def load_state(self, state):
        """Restore persisted state.

        The base class is given the chance to restore its own fields. The
        stored ``container_name`` is not read back because the property
        recomputes it from the prefix, user and server name.
        """
        super(MarathonSpawner, self).load_state(state)

    def get_health_checks(self):
        """Return the single TCP health check applied to the app."""
        return [MarathonHealthCheck(
            protocol='TCP',
            port_index=0,
            grace_period_seconds=300,
            interval_seconds=60,
            timeout_seconds=20,
            max_consecutive_failures=0
        )]

    def get_volumes(self):
        """Build Marathon volumes from ``self.volumes``.

        Path and external-name templates go through ``format_volume_name``.
        Only the first volume per container path is kept.
        """
        out_vols = []
        seen_paths = set()
        for v in self.volumes:
            mv = MarathonContainerVolume.from_json(v)
            mv.container_path = self.format_volume_name(mv.container_path, self)
            mv.host_path = self.format_volume_name(mv.host_path, self)
            if mv.external and 'name' in mv.external:
                mv.external['name'] = self.format_volume_name(mv.external['name'], self)
            # There should be only one mount per container path.
            if mv.container_path not in seen_paths:
                seen_paths.add(mv.container_path)
                out_vols.append(mv)
        return out_vols

    def get_app_cmd(self):
        """Return ``app_cmd`` with its ``{...}`` placeholders substituted."""
        retval = self.app_cmd.replace("{username}", self.user.name)
        retval = retval.replace("{userwebport}", str(self.user_web_port))
        if self.use_jupyterlab == 1:
            print("This is where I should do some thing if I want to run Jupyter lab")

        if self.user_ssh_hagroup != "":
            # With an HAProxy group the ssh port is left to $PORT0.
            retval = retval.replace("{usersshport}", "$PORT0")
        else:
            retval = retval.replace("{usersshport}", str(self.user_ssh_port))
        return retval

    def get_port_mappings(self):
        """Return port mappings for BRIDGE mode; an empty list otherwise."""
        if self.network_mode != "BRIDGE":
            return []
        return [
            MarathonContainerPortMapping(
                container_port=p,
                host_port=0,
                protocol='tcp'
            )
            for p in self.ports
        ]

    def get_constraints(self):
        """Return ``marathon_constraints`` as ``MarathonConstraint`` objects."""
        return [MarathonConstraint.from_json(c)
                for c in self.marathon_constraints]

    @run_on_executor
    def get_deployment(self, deployment_id):
        """Return the deployment with ``deployment_id``, or ``None``."""
        for d in self.marathon.list_deployments():
            if d.id == deployment_id:
                return d
        return None

    @run_on_executor
    def get_deployment_for_app(self, app_name):
        """Return the first deployment affecting ``app_name``, or ``None``."""
        for d in self.marathon.list_deployments():
            if app_name in d.affected_apps:
                return d
        return None

    def get_ip_and_port(self, app_info):
        """Return ``(ip, first port)`` of the app's only task."""
        assert len(app_info.tasks) == 1
        ip = socket.gethostbyname(app_info.tasks[0].host)
        port = app_info.tasks[0].ports[0]
        return (ip, port)

    @run_on_executor
    def get_app_info(self, app_name):
        """Return the app with embedded tasks, or ``None`` if not found."""
        try:
            app = self.marathon.get_app(app_name, embed_tasks=True)
        except NotFoundError:
            self.log.info("The %s application has not been started yet", app_name)
            return None
        else:
            return app

    def _public_hub_api_url(self):
        """Rebuild the hub API URL with the public host/port overrides."""
        uri = urlparse(self.hub.api_url)
        port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port
        ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname
        return urlunparse((
            uri.scheme,
            '%s:%s' % (ip, port),
            uri.path,
            uri.params,
            uri.query,
            uri.fragment
        ))

    def get_env(self):
        """Return the environment passed to the Marathon app."""
        env = super(MarathonSpawner, self).get_env()
        env.update(dict(
            # Jupyter Hub config
            JPY_USER=self.user.name,
            JPY_HUB_PREFIX=self.hub.server.base_url,
            JPY_USER_WEB_PORT=str(self.user_web_port),
            JPY_USER_SSH_PORT=str(self.user_ssh_port),
            JPY_USER_SSH_HOST=str(self.user_ssh_host)
        ))

        if self.notebook_dir:
            env['NOTEBOOK_DIR'] = self.notebook_dir

        if self.hub_ip_connect or self.hub_port_connect > 0:
            hub_api_url = self._public_hub_api_url()
        else:
            hub_api_url = self.hub.api_url
        env['JPY_HUB_API_URL'] = hub_api_url

        # custom_env is a list of dicts; later entries override earlier keys.
        for x in self.custom_env:
            for k, v in x.items():
                env[k] = str(v)

        return env

    @staticmethod
    def _merge_unique(existing, additions):
        """Return ``existing`` plus the ``additions`` not already present.

        Always returns a new list, so repeated spawns neither mutate the
        trait value in place nor pile up duplicate entries.
        """
        merged = list(existing)
        for item in additions:
            if item not in merged:
                merged.append(item)
        return merged

    def _read_user_record(self):
        """Return this user's record from ``zeta_user_file`` (or ``{}``).

        The file holds one JSON object per line; blank lines and lines
        starting with ``#`` are skipped.
        """
        with open(self.zeta_user_file, "r") as handle:
            user_file = handle.read()
        for x in user_file.split("\n"):
            if x.strip().find("#") != 0 and x.strip() != "":
                y = json.loads(x)
                if y['user'] == self.user.name:
                    return y
        return {}

    def _apply_user_record(self, user_ar):
        """Copy the per-user settings from ``user_ar`` onto the spawner."""
        print("User List identified and loaded, setting values to %s" % user_ar)
        self.cpu_limit = user_ar['cpu_limit']
        self.mem_limit = user_ar['mem_limit']
        self.user_ssh_port = user_ar['user_ssh_port']
        self.user_web_port = user_ar['user_web_port']
        self.user_ssh_host = user_ar['user_ssh_host']
        # These two keys are optional and fall back to their defaults.
        try:
            self.user_ssh_hagroup = user_ar['user_ssh_hagroup']
        except Exception:
            self.user_ssh_hagroup = ""

        try:
            self.use_jupyterlab = int(user_ar['use_jupyterlab'])
        except Exception:
            self.use_jupyterlab = 0

        self.network_mode = user_ar['network_mode']
        self.app_image = user_ar['app_image']
        self.marathon_constraints = user_ar['marathon_constraints']
        self.ports = self._merge_unique(
            self.ports, [self.user_web_port, self.user_ssh_port])
        self.custom_env = self._merge_unique(
            self.custom_env, user_ar['custom_env'])
        self.volumes = self._merge_unique(self.volumes, user_ar['volumes'])
        print("User List Loaded!")

    def update_users(self):
        """Apply per-user settings from ``zeta_user_file``, if configured.

        Does nothing when ``zeta_user_file`` is blank. If the file cannot be
        read, the user is missing, or the record cannot be applied, the
        problem is logged. An ``Exception`` is then raised when
        ``no_user_file_fail`` is true; otherwise the spawn continues with
        the current settings.

        Example line of the user file:
        { "user": "******", "cpu_limit": "1", "mem_limit": "2G", "user_ssh_port": 10500, "user_web_port": 10400, "network_mode": "BRIDGE", "app_image": "$APP_IMG", "marathon_constraints": []}
        """
        # No changes if the zeta_user_file is blank
        if self.zeta_user_file == "":
            return

        try:
            user_ar = self._read_user_record()
        except Exception as exc:
            self.log.error("Could not find or open zeta_user_file: %s (%s)",
                           self.zeta_user_file, exc)
            if self.no_user_file_fail:
                raise Exception("Could not open file and config says don't go on")
            return

        if not user_ar:
            self.log.error("Could not find current user %s in zeta_user_file %s - Not Spawning"  % (self.user.name, self.zeta_user_file))
            if self.no_user_file_fail:
                raise Exception('no_user_file_fail is True, will not go on')
            return

        try:
            self._apply_user_record(user_ar)
        except Exception as exc:
            self.log.error("Could not apply zeta_user_file settings for %s: %s",
                           self.user.name, exc)
            if self.no_user_file_fail:
                raise Exception("Invalid zeta_user_file record and config says don't go on")

    @gen.coroutine
    def start(self):
        """Create the Marathon app and wait until its task is healthy.

        Returns:
            ``(ip, port)`` of the running task, or ``None`` when Marathon
            did not accept the app.
        """
        # First make a quick call to determine if user info was updated
        self.update_users()
        # Go on to start the notebook
        docker_container = MarathonDockerContainer(
            image=self.app_image,
            network=self.network_mode,
            port_mappings=self.get_port_mappings())

        app_container = MarathonContainer(
            docker=docker_container,
            type='DOCKER',
            volumes=self.get_volumes())

        # the memory request in marathon is in MiB
        if hasattr(self, 'mem_limit') and self.mem_limit is not None:
            mem_request = self.mem_limit / 1024.0 / 1024.0
        else:
            mem_request = 1024.0

        if self.user_ssh_hagroup != "":
            myports = [self.user_ssh_port]
            labels = {"HAPROXY_GROUP": self.user_ssh_hagroup, "HA_EDGE_CONF": "1"}
        else:
            labels = {}
            myports = []

        app_request = MarathonApp(
            id=self.container_name,
            cmd=self.get_app_cmd(),
            env=self.get_env(),
            cpus=self.cpu_limit,
            mem=mem_request,
            container=app_container,
            constraints=self.get_constraints(),
            health_checks=self.get_health_checks(),
            instances=1,
            labels=labels,
            ports=myports,
            fetch=self.fetch,
        )

        app = self.marathon.create_app(self.container_name, app_request)
        if app is False or app.deployments is None:
            self.log.error("Failed to create application for %s", self.container_name)
            return None

        # Poll until exactly one task reports healthy.
        while True:
            app_info = yield self.get_app_info(self.container_name)
            if app_info and app_info.tasks_healthy == 1:
                ip, port = self.get_ip_and_port(app_info)
                break
            yield gen.sleep(1)
        return (ip, port)

    @gen.coroutine
    def stop(self, now=False):
        """Delete the Marathon app; unless ``now``, wait for the teardown."""
        try:
            status = self.marathon.delete_app(self.container_name)
        except Exception:
            self.log.error("Could not delete application %s", self.container_name)
            raise
        else:
            if not now:
                # Wait until the delete deployment is no longer listed.
                while True:
                    deployment = yield self.get_deployment(status['deploymentId'])
                    if deployment is None:
                        break
                    yield gen.sleep(1)

    @gen.coroutine
    def poll(self):
        """Return ``None`` while the app is alive, else an exit status.

        ``1`` means a deployment is stopping the app; ``0`` means there is
        no deployment in flight and no single healthy task.
        """
        deployment = yield self.get_deployment_for_app(self.container_name)
        if deployment:
            for current_action in deployment.current_actions:
                if current_action.action == 'StopApplication':
                    self.log.error("Application %s is shutting down", self.container_name)
                    return 1
            return None

        app_info = yield self.get_app_info(self.container_name)
        if app_info and app_info.tasks_healthy == 1:
            return None
        return 0
Example #27
0
class MarathonHTTPClient(AbstractSchedulerClient):
    """Scheduler client that runs app containers through Marathon.

    One-off ``run`` commands are delegated to a ``FleetHTTPClient``.
    """

    def __init__(self, target, auth, options, pkey):
        """Build the Marathon client and the Fleet client used by ``run``."""
        super(MarathonHTTPClient, self).__init__(target, auth, options, pkey)
        self.target = settings.MARATHON_HOST
        self.registry = settings.REGISTRY_HOST + ":" + settings.REGISTRY_PORT
        self.client = MarathonClient("http://" + self.target + ":8180")
        self.fleet = FleetHTTPClient("/var/run/fleet.sock", auth, options, pkey)

    # helpers
    def _app_id(self, name):
        """Map a job name to a Marathon app id (``_`` becomes ``.``)."""
        return name.replace("_", ".")

    # container api
    def create(self, name, image, command="", **kwargs):
        """Create a new container"""
        app_id = self._app_id(name)
        # The container type parsed from the job name selects the per-type
        # memory and cpu limits.
        c_type = re.match(MATCH, name).groupdict()["c_type"]
        image = self.registry + "/" + image
        mems = kwargs.get("memory", {}).get(c_type)
        m = 0
        if mems:
            mems = mems.lower()
            # A two-letter unit suffix is first shortened to one letter; the
            # remaining final character is then dropped and the rest parsed
            # as an integer.
            if mems[-2:-1].isalpha() and mems[-1].isalpha():
                mems = mems[:-1]
            m = int(mems[:-1])
        c = 0.5
        cpu = kwargs.get("cpu", {}).get(c_type)
        if cpu:
            c = cpu
        cmd = "docker run --name {name} -P {image} {command}".format(
            name=name, image=image, command=command)
        # The app is registered with zero instances; start() scales it up.
        self.client.create_app(app_id, MarathonApp(cmd=cmd, mem=m, cpus=c, instances=0))
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(app_id).tasks_running == 0:
                return
            time.sleep(1)

    def start(self, name):
        """Start a container."""
        app_id = self._app_id(name)
        self.client.scale_app(app_id, 1, force=True)
        for _ in xrange(POLL_ATTEMPTS):
            if self.client.get_app(app_id).tasks_running == 1:
                break
            time.sleep(1)
        host = self.client.get_app(app_id).tasks[0].host
        self._waitforcontainer(host, name)

    def destroy(self, name):
        """Destroy a container."""
        app_id = self._app_id(name)
        try:
            host = self.client.get_app(app_id).tasks[0].host
            self.client.delete_app(app_id, force=True)
            self._delete_container(host, name)
        except Exception:
            # Best effort: if any step fails, still request the app deletion.
            self.client.delete_app(app_id, force=True)

    def _get_container_state(self, host, name):
        """Return ``JobState.up`` if the container on ``host`` is running.

        Returns ``JobState.destroyed`` when the inspection fails, and
        ``None`` when the container exists but is not running.
        """
        docker_cli = Client("tcp://{}:2375".format(host), timeout=1200, version="1.17")
        try:
            if docker_cli.inspect_container(name)["State"]["Running"]:
                return JobState.up
        except Exception:
            return JobState.destroyed

    def _waitforcontainer(self, host, name):
        """Poll until the container is up; raise ``RuntimeError`` otherwise."""
        for _ in xrange(POLL_WAIT):
            if self._get_container_state(host, name) == JobState.up:
                return
            time.sleep(1)
        raise RuntimeError("App container Not Started")

    def _delete_container(self, host, name):
        """Force-remove the named container on ``host`` if it has state."""
        docker_cli = Client("tcp://{}:2375".format(host), timeout=1200, version="1.17")
        if docker_cli.inspect_container(name)["State"]:
            docker_cli.remove_container(name, force=True)

    def run(self, name, image, entrypoint, command):  # noqa
        """Run a one-off command."""
        return self.fleet.run(name, image, entrypoint, command)

    def state(self, name):
        """Display the given job's running state."""
        app_id = self._app_id(name)
        try:
            for _ in xrange(POLL_ATTEMPTS):
                # Read the task count once so both comparisons see the same
                # snapshot of the app.
                running = self.client.get_app(app_id).tasks_running
                if running == 1:
                    return JobState.up
                elif running == 0:
                    return JobState.created
                time.sleep(1)
        except Exception:
            return JobState.destroyed
class MarathonSpawner(Spawner):
    """Spawner that runs each user's notebook server as a Marathon app.

    One single-instance application is created (or updated) per user under
    the id ``/<app_prefix>/<user name>/notebook``.  Resource requests and
    the image can be chosen by the user through ``options_form``.
    """

    app_image = Unicode("jupyterhub/singleuser:%s" % _jupyterhub_xy,
                        config=True)

    app_prefix = Unicode("jupyter",
                         help=dedent("""
            Prefix for app names. The full app name for a particular
            user will be <prefix>/<username>/notebook.
            """)).tag(config=True)

    marathon_host = Unicode(
        u'', help="Hostname of Marathon server").tag(config=True)

    marathon_constraints = List(
        [],
        help='Constraints to be passed through to Marathon').tag(config=True)

    unreachable_strategy = Any(
        None,
        help='Unreachable strategy to be passed through to Marathon').tag(
            config=True)

    volumes = List([],
                   help=dedent("""
            A list in Marathon REST API format for mounting volumes into the docker container.
            [
                {
                    "containerPath": "/foo",
                    "hostPath": "/bar",
                    "mode": "RW"
                }
            ]

            Note that using the template variable {username} in containerPath,
            hostPath or the name variable in case it's an external drive
            it will be replaced with the current user's name.
            """)).tag(config=True)

    # Defaults and upper bounds offered by the options form.
    max_cpu = Float(2, config=True)
    cpu = Float(1, config=True)

    max_mem = Float(4096, config=True)
    mem = Float(1024, config=True)

    max_disk = Float(20000, config=True)
    disk = Float(5000, config=True)

    # The GPU input is only rendered in the form when max_gpu > 0.
    max_gpu = Integer(0, config=True)
    gpu = Integer(0, config=True)

    mesos_user = Unicode(None, config=True, allow_none=True)

    autotimeout = Integer(
        None,
        help="Seconds to automatically timeout unused notebook servers",
        config=True,
        allow_none=True)

    hub_ip_connect = Unicode(
        "", help="Public IP address of the hub").tag(config=True)

    @observe('hub_ip_connect')
    def _ip_connect_changed(self, change):
        """Warn that ``hub_ip_connect`` is deprecated on JupyterHub >= 0.8."""
        if jupyterhub.version_info >= (0, 8):
            warnings.warn(
                "MarathonSpawner.hub_ip_connect is no longer needed with JupyterHub 0.8."
                "  Use JupyterHub.hub_connect_ip instead.",
                DeprecationWarning,
            )

    hub_port_connect = Integer(-1,
                               help="Public PORT of the hub").tag(config=True)

    @observe('hub_port_connect')
    def _port_connect_changed(self, change):
        """Warn that ``hub_port_connect`` is deprecated on JupyterHub >= 0.8."""
        if jupyterhub.version_info >= (0, 8):
            warnings.warn(
                "MarathonSpawner.hub_port_connect is no longer needed with JupyterHub 0.8."
                "  Use JupyterHub.hub_connect_port instead.",
                DeprecationWarning,
            )

    format_volume_name = Any(
        help="""Any callable that accepts a string template and a Spawner
        instance as parameters in that order and returns a string.
        """).tag(config=True)

    @default('format_volume_name')
    def _get_default_format_volume_name(self):
        return default_format_volume_name

    # fix default port to 8888, used in the container
    @default('port')
    def _port_default(self):
        return 8888

    # default to listening on all-interfaces in the container
    @default('ip')
    def _ip_default(self):
        return '0.0.0.0'

    # Thread pool shared by every instance of the class; created lazily.
    _executor = None

    @property
    def executor(self):
        """Return the class-wide thread pool, creating it on first use."""
        cls = self.__class__
        if cls._executor is None:
            cls._executor = ThreadPoolExecutor(5)
        return cls._executor

    def __init__(self, *args, **kwargs):
        super(MarathonSpawner, self).__init__(*args, **kwargs)
        self.marathon = MarathonClient(self.marathon_host)
        # Called for its side effect: initialises ``stored_user_options``.
        self.get_state()

    @property
    def app_id(self):
        """Marathon application id for the current user."""
        return '/%s/%s/notebook' % (self.app_prefix, self.user.name)

    def get_state(self):
        """Return the persisted state, including the current user options."""
        state = super(MarathonSpawner, self).get_state()
        state['user_options'] = self.stored_user_options = self.user_options
        return state

    def load_state(self, state):
        """Restore state and remember previously stored user options."""
        super(MarathonSpawner, self).load_state(state)
        self.stored_user_options = state.get('user_options', {})

    def get_health_checks(self):
        """Return the single TCP health check used for the notebook app."""
        return [
            MarathonHealthCheck(protocol='TCP',
                                port_index=0,
                                grace_period_seconds=300,
                                interval_seconds=30,
                                timeout_seconds=20,
                                max_consecutive_failures=0)
        ]

    def get_volumes(self):
        """Build the container volumes from the ``volumes`` setting.

        Container path, host path and (when present) the external volume
        name are each passed through ``format_volume_name``.
        """
        volumes = []
        for v in self.volumes:
            mv = MarathonContainerVolume.from_json(v)
            mv.container_path = self.format_volume_name(
                mv.container_path, self)
            mv.host_path = self.format_volume_name(mv.host_path, self)
            if mv.external and 'name' in mv.external:
                mv.external['name'] = self.format_volume_name(
                    mv.external['name'], self)
            volumes.append(mv)
        return volumes

    def get_constraints(self):
        """Convert ``marathon_constraints`` into ``MarathonConstraint`` objects."""
        return [
            MarathonConstraint.from_json(c) for c in self.marathon_constraints
        ]

    def get_ip_and_port(self, app_info):
        """Return ``(ip, port)`` of the app's only task.

        The task host name is resolved with ``socket.gethostbyname`` and
        the first port of the task is used.
        """
        assert len(app_info.tasks) == 1
        ip = socket.gethostbyname(app_info.tasks[0].host)
        return (ip, app_info.tasks[0].ports[0])

    @run_on_executor
    def get_app_info(self, app_id):
        """Fetch the app (with tasks) from Marathon on the executor.

        Returns ``None`` when Marathon reports the app as not found.
        Because of ``run_on_executor`` callers receive a future and must
        ``yield`` it to obtain the result.
        """
        try:
            app = self.marathon.get_app(app_id, embed_tasks=True)
        except NotFoundError:
            self.log.info("The %s application has not been started yet",
                          app_id)
            return None
        else:
            return app

    def _public_hub_api_url(self):
        """Return the hub API URL with the public IP/port overrides applied."""
        uri = urlparse(self.hub.api_url)
        port = self.hub_port_connect if self.hub_port_connect > 0 else uri.port
        ip = self.hub_ip_connect if self.hub_ip_connect else uri.hostname
        return urlunparse((uri.scheme, '%s:%s' % (ip, port), uri.path,
                           uri.params, uri.query, uri.fragment))

    def get_args(self):
        """Return the command-line arguments for the single-user server.

        Replaces ``--hub-api-url`` with the public URL when
        ``hub_ip_connect`` is set, and always replaces ``--port`` with
        ``--port=$PORT0``.
        """
        args = super().get_args()
        if self.hub_ip_connect:
            # JupyterHub 0.7 specifies --hub-api-url
            # on the command-line, which is hard to update
            for idx, arg in enumerate(list(args)):
                if arg.startswith('--hub-api-url='):
                    args.pop(idx)
                    break
            args.append('--hub-api-url=%s' % self._public_hub_api_url())
        for idx, arg in enumerate(list(args)):
            if arg.startswith('--port='):
                args.pop(idx)
                break
        args.append('--port=$PORT0')
        return args

    def options_from_form(self, formdata):
        """Convert submitted form data into the ``user_options`` dict.

        ``cpu``, ``mem`` and ``disk`` are required fields; ``gpu`` and
        ``force_pull_image`` are only set when present in ``formdata``.
        """
        options = {}
        options['app_image'] = formdata['app_image'][0] or None
        if 'force_pull_image' in formdata:
            options['force_pull_image'] = formdata['force_pull_image'][
                0] == 'on'
        options['cpu'] = float(formdata['cpu'][0])
        options['mem'] = float(formdata['mem'][0])
        options['disk'] = float(formdata['disk'][0])
        if formdata.get('gpu', None):
            options['gpu'] = int(formdata['gpu'][0])
        return options

    @property
    def options_form(self):
        """HTML form for image and resource selection.

        Values are pre-filled from the stored user options, falling back to
        the configured defaults.  The GPU row is appended only when
        ``max_gpu`` is positive.
        """
        template = """
        <div class="form-group">
            <label for="app_image">Image <span class="label label-default">Optional</span></label>
            <input id="app_image" class="form-control" name="app_image" type="text" placeholder="e.g. %(default_app_image)s" value="%(app_image)s" />
        </div>
        <div class="checkbox">
            <label for="force_pull_image">
                <input id="force_pull_image" name="force_pull_image" type="checkbox" value="on" />
                Force pull image
            </label>
        </div>
        <div class="form-group">
            <div class="row">
                <div class="col-sm-4">
                    <label for="cpu">CPU</label>
                    <input id="cpu" class="form-control" name="cpu" type="number" step="any" value="%(cpu)s" min="%(min_cpu)s" max="%(max_cpu)s" required />
                </div>
                <div class="col-sm-4">
                    <label for="mem">Mem (MiB)</label>
                    <input id="mem" class="form-control" name="mem" type="number" step="any" value="%(mem)s" min="%(min_mem)s" max="%(max_mem)s" required />
                </div>
                <div class="col-sm-4">
                    <label for="disk">Disk (MiB)</label>
                    <input id="disk" class="form-control" name="disk" type="number" step="any" value="%(disk)s" min="%(min_disk)s" max="%(max_disk)s" required />
                </div>
            </div>
        </div>
        """ % {
            'default_app_image': self.app_image,
            'app_image': self.stored_user_options.get('app_image', None) or '',
            'min_cpu': 0.001,
            'max_cpu': self.max_cpu,
            'cpu': remove_zeros(
                str(self.stored_user_options.get('cpu', self.cpu))),
            'min_mem': 32,
            'max_mem': self.max_mem,
            'mem': remove_zeros(
                str(self.stored_user_options.get('mem', self.mem))),
            'min_disk': 1000,
            'max_disk': self.max_disk,
            'disk': remove_zeros(
                str(self.stored_user_options.get('disk', self.disk))),
        }
        if self.max_gpu > 0:
            template += """
            <div class="form-group">
                <div class="row">
                    <div class="col-sm-4">
                        <label for="gpu">GPU</label>
                        <input id="gpu" class="form-control" name="gpu" type="number" step="1" value="%(gpu)s" min="%(min_gpu)s" max="%(max_gpu)s" required />
                    </div>
                </div>
            </div>
            """ % {
                'min_gpu': 0,
                'max_gpu': self.max_gpu,
                'gpu': self.stored_user_options.get('gpu', self.gpu),
            }
        return """<div>%s</div>""" % template

    @gen.coroutine
    def start(self):
        """Create or update the user's Marathon app and wait until healthy.

        Returns ``(ip, port)`` of the single healthy task.  Raises
        ``MarathonSpawnerException`` when the app disappears or is scaled
        to zero instances while waiting; errors from creating/updating the
        app are logged and re-raised.
        """
        app_image = self.user_options.get('app_image', None) or self.app_image
        force_pull_image = self.user_options.get('force_pull_image', False)
        self.log.info("starting a Marathon app with image=%s", app_image)

        container_params = {
            'image': app_image,
            'force_pull_image': force_pull_image
        }
        docker_container = MarathonDockerContainer(**container_params)

        app_container = MarathonContainer(docker=docker_container,
                                          type='MESOS',
                                          volumes=self.get_volumes())

        cpu = self.user_options.get('cpu', None)
        mem = self.user_options.get('mem', None)
        disk = self.user_options.get('disk', None)
        gpu = self.user_options.get('gpu', None)
        self.log.info("resource: (cpu=%s, mem=%s, disk=%s, gpu=%s)", cpu, mem,
                      disk, gpu)

        cmd = self.cmd + self.get_args()
        env = self.get_env()

        port_definitions = [PortDefinition(port=0, protocol='tcp')]

        app_request = MarathonApp(
            id=self.app_id,
            cmd=' '.join(
                cmd),  # cmd does not use Docker image's default entrypoint
            env=env,
            cpus=cpu,
            mem=mem,
            disk=disk,
            gpus=gpu,
            user=self.mesos_user,
            container=app_container,
            port_definitions=port_definitions,
            networks=[{
                'mode': 'host'
            }],
            constraints=self.get_constraints(),
            health_checks=self.get_health_checks(),
            unreachable_strategy=self.unreachable_strategy,
            instances=1)

        # get_app_info runs on the executor and hands back a future; it has
        # to be yielded, otherwise the (always truthy) future would make the
        # "app already exists" branch unconditional.
        app_info = yield self.get_app_info(self.app_id)
        try:
            if app_info is not None:
                self.marathon.update_app(self.app_id, app_request, force=True)
            else:
                self.marathon.create_app(self.app_id, app_request)
        except Exception as e:
            self.log.error("Failed to create application for %s: %s",
                           self.app_id, e)
            raise

        # Poll once per second until exactly one task is healthy.
        while True:
            app_info = yield self.get_app_info(self.app_id)
            if app_info is None:
                raise MarathonSpawnerException("Application %s is lost" %
                                               self.app_id)
            elif app_info.instances == 0:
                raise MarathonSpawnerException(
                    "No instance for application %s" % self.app_id)
            elif app_info.tasks_healthy == 1:
                ip, port = self.get_ip_and_port(app_info)
                break
            yield gen.sleep(1)
        return (ip, port)

    @gen.coroutine
    def stop(self, now=False):
        """Scale the app down to zero instances.

        Unless ``now`` is true, waits until the app has no pending
        deployments (or can no longer be found).
        """
        try:
            self.marathon.update_app(self.app_id,
                                     MarathonApp(instances=0),
                                     force=True)
        except Exception:
            self.log.error("Failed to delete application %s", self.app_id)
            raise
        else:
            if not now:
                while True:
                    app_info = yield self.get_app_info(self.app_id)
                    if app_info is None:
                        # Stopping application is lost, just ignore it!
                        break
                    elif len(app_info.deployments) == 0:
                        # This is the success case.
                        break
                    yield gen.sleep(1)

    @gen.coroutine
    def poll(self):
        """Report whether the app is still running.

        Returns ``None`` while the app is healthy, otherwise a status code:
        ``3`` app not found, ``1`` app is being stopped, ``2`` no healthy
        task, ``0`` stopped here because of ``autotimeout``.
        """
        app_info = yield self.get_app_info(self.app_id)

        if app_info is None:
            self.log.error("Application %s is lost", self.app_id)
            return 3

        for deployment in app_info.deployments:
            for current_action in deployment.current_actions:
                if current_action.action == 'StopApplication':
                    self.log.error("Application %s is shutting down",
                                   self.app_id)
                    return 1

        if app_info.tasks_healthy == 0:
            self.log.error("No healthy instance for application %s",
                           self.app_id)
            return 2

        if self.autotimeout is not None:
            tm_diff = datetime.utcnow() - self.user.last_activity
            # total_seconds() rather than .seconds: the latter drops whole
            # days and would under-report long inactivity.
            self.log.debug("Application %s is inactive for %d sec",
                           self.app_id, tm_diff.total_seconds())
            if tm_diff > timedelta(seconds=self.autotimeout):
                self.log.info(
                    "Stopping application %s because it's inactive for more than %d sec",
                    self.app_id, self.autotimeout)
                # Do not yield the result of stop here
                self.stop()
                return 0

        return None
Example #29
0
def deploy(app_definition, marathon_url, instances, auth_token, zero, force):
    """Deploy a new application version on Marathon and retire older ones.

    :param app_definition: path to a JSON Marathon app definition file.
    :param marathon_url: URL of the Marathon server.
    :param instances: target instance count, or ``None`` to derive it
        (2 for a first deployment, otherwise the old app's count).
    :param auth_token: token passed to ``MarathonClient``.
    :param zero: ``'Yes'`` selects the zero-downtime rollout.
    :param force: ``'Yes'`` updates the app in place if creating it fails;
        otherwise a failed creation ends the process via ``sys.exit()``.
    """
    old_appids = []
    # Connect to Marathon
    print("\nConnecting to Marathon...")
    c = MarathonClient(marathon_url, auth_token=auth_token)
    print("Connected to", marathon_url)

    # Pick up the Marathon App Definition file (closed as soon as it is read)
    with open(app_definition) as definition_file:
        app = MarathonApp.from_json(json.load(definition_file))
    new_app_id = app.id
    service_name = new_app_id.split("/")[-1].split(".")[0]

    # Instantiate the new application on DC/OS but don't launch it yet
    # The application definition instances field should be 0 by default
    # If forced, the application will be relaunched even if the ID already exists
    print("\nInstantiating new application on Marathon with", app.instances,
          "instances...")
    try:
        c.create_app(new_app_id, app)
    except Exception:
        # ``Exception`` instead of a bare ``except`` so Ctrl-C is not
        # mistaken for a failed creation.
        if force == 'Yes':
            print("\nForcing redeploy of the same app id...", new_app_id)
            c.update_app(new_app_id, app, force=True, minimal=True)
            check_deployment(c, new_app_id)
        else:
            sys.exit()
    print("Created app", new_app_id)

    # List and find currently running apps of the same service
    # This assumes the naming convention (id): /some/group/service_name.uniquevalue
    print("\nFinding any existing apps for service:", service_name)
    for existing in c.list_apps():
        existing_service_name = existing.id.split("/")[-1].split(".")[0]
        if (service_name == existing_service_name) and existing.instances > 0:
            print("Found up and running application id:", existing.id)
            old_appids.append(existing.id)

    # If it's the first deployment ever, just launch the desired number of instances
    # Otherwise perform a hybrid release
    # Finally clean up any older app instances running
    if not old_appids:
        if instances is None:
            instances = 2
        print("No current apps found. Launching brand new service with",
              instances, "instances...")
        c.scale_app(new_app_id, instances=instances)
        check_deployment(c, new_app_id)
        check_health(c, new_app_id)

    else:
        old_appids.reverse()
        if zero == 'Yes':
            print("\nStarting zero downtime deployment for...", new_app_id)
            for old_appid in old_appids:
                if instances is None:
                    instances = c.get_app(old_appid).instances
                if (old_appid == '' or old_appid == new_app_id
                        or old_appid == '/' + new_app_id):
                    print("Scaling existing app_id", new_app_id, "to",
                          instances, "instances...")
                    c.scale_app(new_app_id, instances=instances)
                    check_deployment(c, new_app_id)
                    check_health(c, new_app_id)

                else:
                    print("Target number of total instances:", instances)
                    # Move half of the target (at least one) in the first step.
                    delta = int(round(instances * .50))
                    delta = (delta if delta > 0 else 1)

                    scale(c, new_app_id, old_appid, delta)

                    if (c.get_app(new_app_id).instances != instances):
                        print("\nLaunch", instances - delta,
                              "remaining instance(s) of the new version...")
                        c.scale_app(new_app_id, instances=instances)
                        check_deployment(c, new_app_id)
                        check_health(c, new_app_id)
                    if (c.get_app(old_appid).instances > 0):
                        print(
                            "Finish shutting down remaining instances of the old version..."
                        )
                        c.scale_app(old_appid, instances=0)
                        check_deployment(c, old_appid)
        else:
            print("Started deployment with downtime...")
            for old_appid in old_appids:
                if instances is None:
                    # Same default as the zero-downtime path: inherit the
                    # old app's count, read before it is scaled to zero, so
                    # the new app is never scaled with instances=None.
                    instances = c.get_app(old_appid).instances
                c.scale_app(old_appid, instances=0)
                check_deployment(c, old_appid)
            c.scale_app(new_app_id, instances=instances)
            check_deployment(c, new_app_id)
            check_health(c, new_app_id)

    print("\nSUCCESS:\nNew application ID:", new_app_id,
          "\nRunning instances:", instances)
Example #30
0
class MarathonIF(object):
    """Wrapper around ``MarathonClient`` adding retry and wait helpers."""

    def __init__(self, marathon_addr, my_addr, mesos):
        self.mcli = MarathonClient(marathon_addr)
        self.myAddr = my_addr
        self.mesos = mesos

    def get_apps(self):
        """Return the list of apps reported by Marathon."""
        return self.mcli.list_apps()

    def get_app(self, app_id, timeout=300):
        """Fetch an app, retrying every 10 seconds on errors.

        :returns: the app, or ``None`` if Marathon reports it as not found.
        :raises Exception: when no attempt succeeded within ``timeout``
            seconds.
        """
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                return self.mcli.get_app(app_id)
            except marathon.exceptions.NotFoundError:
                return None
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # Ctrl-C is not swallowed by the retry loop.
                l.info("mcli: get_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception(
            "mcli get_app timed out, possible zookeper/marathon/mesos malfunction"
        )

    def delete_app(self, app_id, force=False, timeout=200):
        """Delete an app, retrying every 10 seconds on errors.

        :raises Exception: when no attempt succeeded within ``timeout``
            seconds.
        """
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                self.mcli.delete_app(app_id, force)
                return
            except Exception:
                l.info("mcli: delete_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception(
            "mcli delete_app timed out, possible zookeper/marathon/mesos malfunction"
        )

    def delete_deployment(self, dep_id):
        """Delete the deployment ``dep_id``."""
        return self.mcli.delete_deployment(dep_id)

    def get_deployments(self):
        """Return the deployments reported by Marathon."""
        return self.mcli.list_deployments()

    def delete_app_ifexisting(self, app_id, trys=4):
        """Delete ``app_id`` if it exists, making up to ``trys`` attempts.

        :returns: ``None`` (also when the app does not exist).
        :raises Exception: the error of the last failed attempt once all
            attempts are used up.
        """
        last_error = None
        for _ in range(0, trys):
            try:
                if self.get_app(app_id):
                    return self.delete_app(app_id)
                return None
            except Exception as err:
                # Keep the error: a bare ``raise`` after the loop has no
                # active exception to re-raise.
                last_error = err
                pprint("<p>Error: %s</p>" % type(err))
                time.sleep(10)
        if last_error is None:
            raise RuntimeError(
                "delete_app_ifexisting made no attempt (trys=%r)" % (trys, ))
        raise last_error

    @staticmethod
    def is_valid_app_id(app_id):
        """Return True if ``app_id`` only contains permitted characters.

        NOTE(review): the pattern accepts upper-case letters as well, while
        the error message in ``create_app`` speaks of lowercase only --
        confirm which one is intended.
        """
        # allowed: letters, digits, hyphens, slash, dot
        if re.match("^[A-Za-z0-9-/.]*$", app_id):
            return True
        return False

    def create_app(self, app_id, attr):
        """
            Create and start an app.
            :param app_id: (str) - Application ID
            :param attr: marathon.models.app.MarathonApp application to create.
            :return: the created app
            :raises Exception: if ``app_id`` fails validation; Marathon HTTP
                errors are re-raised, the "app is locked" error only after
                10 attempts one second apart.
        """
        # Validate that app_id conforms to allowed naming scheme.
        if not self.is_valid_app_id(app_id):
            l.error(
                "Error: Only lowercase letters, digits, hyphens are allowed in app_id. %s"
                % app_id)
            raise Exception("Invalid app_id")

        locked_marker = 'App is locked by one or more deployments. Override with the option'
        last_error = None
        for _ in range(0, 10):
            try:
                return self.mcli.create_app(app_id, attr)
            except marathon.exceptions.MarathonHttpError as e:
                if locked_marker not in str(e):
                    raise
                # Remember the lock error so it can be raised once the
                # retries are exhausted (a bare ``raise`` outside the
                # handler has nothing to re-raise).
                last_error = e
                time.sleep(1)
        raise last_error

    def wait_app_removal(self, app):
        """Block until ``get_app`` no longer returns the app; return True."""
        cnt = 0
        while True:
            if not self.get_app(app):
                break
            time.sleep(0.2)
            cnt += 1
            # NOTE(review): cnt is always positive here, so this logs on
            # every poll; a higher threshold may have been intended.
            if cnt > 0:
                l.info("Stuck waiting for %s to be deleted CNT=%d" %
                       (app, cnt))
        return True

    def wait_app_ready(self, app, running_count, sleep_before_next_try=1):
        """Wait until exactly ``running_count`` tasks run; return the app.

        When more tasks than requested are running the app is scaled down
        to ``running_count`` first.  A progress line is logged every 30th
        poll.
        """
        cnt = 0
        while True:
            a1 = self.get_app(app)
            # if tasks_running are greater (due to whatever reason, scale down accordingly)
            if a1.tasks_running > running_count:
                delta = a1.tasks_running - running_count
                l.info("Found [%d] more apps, scaling down to [%d]", delta,
                       running_count)
                self.scale_app(app, running_count)
                # Allow for some time before next poll
                time.sleep(1)
                continue
            if a1.tasks_running == running_count:
                return a1
            cnt += 1
            time.sleep(sleep_before_next_try)
            if (cnt % 30) == 29:
                l.info(
                    "[%d]Waiting for task to move to running stage, " % cnt +
                    "current stat staged=%d running=%d expected Running=%d" %
                    (a1.tasks_staged, a1.tasks_running, running_count))

    def scale_app(self, app, scale, timeout=300):
        """Scale an app to ``scale`` instances, retrying every 10 seconds.

        :raises Exception: when no attempt succeeded within ``timeout``
            seconds.
        """
        st_time = time.time()
        while (time.time() - st_time < timeout):
            try:
                self.mcli.scale_app(app, scale)
                return
            except Exception:
                l.info("mcli: scale_app returned error")
                l.info(traceback.format_exc())
                l.info("Retrying after 10 secs timeout=%d", timeout)
                time.sleep(10)
        raise Exception(
            "mcli scale_app timed out, possible zookeper/marathon/mesos malfunction"
        )

    def ping(self):
        """Ping the Marathon server."""
        return self.mcli.ping()

    def kill_task(self, app_id, task_id):
        """Kill task ``task_id`` of app ``app_id``."""
        return self.mcli.kill_task(app_id, task_id)
  # NOTE(review): the enclosing loop/function header is not part of this
  # file; instance_name, instance_id, movie, labels, cmd, env,
  # marathon_client, cassandra_host and cassandra_port are assumed to be
  # defined by it.

  # Artifact listed in the app's URIs.
  uris = ['https://s3-us-west-1.amazonaws.com/streaming-artifacts/twitter-consumer.tar.gz']
  # App name "<instance_name>/<slug>": the slug keeps only the letters and
  # spaces of the movie title, lower-cased, with spaces turned into hyphens.
  name = '{}/{}'.format(instance_name, ''.join([i for i in movie if i.isalpha() or i == ' ']).lower().replace(' ', '-'))

  # COMMAND health check: the shell test succeeds while `ps ax` lists a
  # process matching the twitter-consumer jar pattern.
  health_checks = [MarathonHealthCheck(grace_period_seconds=300, interval_seconds=60, max_consecutive_failures=3,
                                       protocol='COMMAND', timeout_seconds=20, ignore_http1xx=False,
                                       command={"value": 'test ! -z \"$(ps ax|egrep \"(twitter-consumer)*.(jar)\"|grep -v grep)\"'})]

  consumer_app = Node(name=name, image='java:8', labels=labels, cmd=cmd, env=env, uris=uris, cpus=0.1, mem=256, disk=0,
                      health_checks=health_checks)

  # presumably registers the movie in Cassandra before the consumer starts -- verify
  add_movie = CassandraAddMovie(marathon_client, cassandra_host, cassandra_port, movie)
  add_movie.commit()

  # NOTE(review): fixed 7 second pause; the reason is not visible here.
  time.sleep(7)

  # Best effort: any error raised while creating the app is silently ignored.
  try:
    marathon_client.create_app(name, consumer_app.app)
  except:
    pass

  # Record the instance as converging and not yet active.
  instances[instance_name] = {
    'instanceId': instance_id,
    'name': instance_name,
    '$set': {
      'status.flags.converging': True,
      'status.flags.active': False
    }
  }

return_instances_info(instances)
Example #32
0
 def create_app_from_json(self, json_data):
   """Build a MarathonApp from ``json_data`` and create it under its own id."""
   app = MarathonApp.from_json(json_data)
   app_id = app.id
   return MarathonClient.create_app(self, app_id, app)