Ejemplo n.º 1
0
    def remove_app(self, appname):
        """Remove an application from the scheduler and record termination.

        Nothing happens when ``appname`` is not in ``self.cell.apps``.
        Otherwise, in order:

        * the placement node is deleted if the app has a server;
        * a deleted trace event is posted if ``self.app_events_dir`` is set;
        * a ``'terminated'`` finished record is written, unless a finished
          node already exists;
        * the parent class ``remove_app`` is invoked.

        :param appname:
            Name of the application instance to remove.
        """
        known_apps = self.cell.apps
        if appname not in known_apps:
            return

        app = known_apps[appname]

        if app.server:
            placement_path = z.path.placement(app.server, appname)
            self.backend.delete(placement_path)

        if self.app_events_dir:
            deleted_event = app_events.DeletedTraceEvent(instanceid=appname)
            trace.post(self.app_events_dir, deleted_event)

        # A missing finished node means the app was terminated by explicit
        # request rather than by finishing on the node.
        finished_path = z.path.finished(appname)
        if not self.backend.exists(finished_path):
            finished_record = {
                'state': 'terminated',
                'when': time.time(),
                'host': app.server,
                'data': None
            }
            self.backend.put(finished_path, finished_record)

        super(Master, self).remove_app(appname)
Ejemplo n.º 2
0
def _post_oom_event(tm_env, appname):
    """Post a killed trace event, flagged as OOM, for ``appname``.

    :param tm_env:
        Environment object; its ``app_events_dir`` receives the event.
    :param appname:
        Instance id recorded in the event.
    """
    events_dir = tm_env.app_events_dir
    oom_event = events.KilledTraceEvent(instanceid=appname, is_oom=True)
    trace.post(events_dir, oom_event)
Ejemplo n.º 3
0
def _post_exit_event(tm_env, appname, exitinfo):
    """Post a finished trace event built from ``exitinfo``.

    :param tm_env:
        Environment object; its ``app_events_dir`` receives the event.
    :param appname:
        Instance id recorded in the event.
    :param exitinfo:
        Mapping queried with ``.get`` for ``'return_code'`` and
        ``'signal'``; either falls back to 256 when absent. The whole
        mapping is attached as the event payload.
    """
    events_dir = tm_env.app_events_dir
    return_code = exitinfo.get('return_code', 256)
    signal_num = exitinfo.get('signal', 256)
    finished_event = events.FinishedTraceEvent(
        instanceid=appname,
        rc=return_code,
        signal=signal_num,
        payload=exitinfo,
    )
    trace.post(events_dir, finished_event)
Ejemplo n.º 4
0
 def _abort_task(self, appname, exception):
     """Set task into aborted state in case of scheduling error."""
     if self.app_events_dir:
         trace.post(
             self.app_events_dir,
             app_events.AbortedTraceEvent(
                 instanceid=appname,
                 why=app_abort.AbortedReason.SCHEDULER.value,
                 payload=exception))
Ejemplo n.º 5
0
    def _run(self, manifest):
        """Create and start the docker container described by ``manifest``.

        Steps, in order: register ``zkutils.exit_on_lost`` as a listener on
        the ZK connection, reset the manifest's tcp/udp ephemeral port
        lists, create the docker log symlink, save the app, register
        identity and running presence, create and start the container,
        refresh the manifest with the container's network info, register
        endpoints, post a service-running trace event and finally print the
        container logs.

        :param manifest:
            Mutable manifest mapping; it is modified in place.
        :raises exc.ContainerSetupError:
            When docker reports that the image was not found.
        """
        context.GLOBAL.zk.conn.add_listener(zkutils.exit_on_lost)

        with lc.LogContext(_LOGGER, self._service.name,
                           lc.ContainerAdapter) as log:
            log.info('Running %r', self._service.directory)

            for proto in ('tcp', 'udp'):
                manifest['ephemeral_ports'][proto] = []

            _create_docker_log_symlink(self._service.data_dir)

            app = runtime.save_app(manifest, self._service.data_dir)

            volume_mapping = self._get_volume_mapping()

            app_presence = presence.EndpointPresence(
                context.GLOBAL.zk.conn, manifest
            )
            app_presence.register_identity()
            app_presence.register_running()

            client = self._get_client()

            try:
                container = _create_container(
                    self._tm_env, self._get_config(), client, app,
                    volume_mapping
                )
            except docker.errors.ImageNotFound:
                message = 'Image {0} was not found'.format(app.image)
                raise exc.ContainerSetupError(
                    message, app_abort.AbortedReason.IMAGE
                )

            container.start()
            container.reload()

            _update_network_info_in_manifest(container, manifest)

            # The updated manifest must be shared with the container: it is
            # written into the first directory of the volume mapping.
            if volume_mapping:
                runtime.save_app(
                    manifest,
                    next(iter(volume_mapping)),
                    app_json='app.json'
                )

            _LOGGER.info('Container is running.')
            app_presence.register_endpoints()

            running_event = events.ServiceRunningTraceEvent(
                instanceid=app.name,
                uniqueid=app.uniqueid,
                service='docker',
            )
            trace.post(self._tm_env.app_events_dir, running_event)

            _print_container_logs(container)
Ejemplo n.º 6
0
def report_aborted(tm_env, instance, why=None, payload=None):
    """Post an aborted trace event for ``instance``.

    Called when aborting after failed configure step or from cleanup.

    :param tm_env:
        Environment object; its ``app_events_dir`` receives the event.
    :param instance:
        Instance id recorded in the event.
    :param why:
        Abort reason; converted with ``_why_str`` before posting.
    :param payload:
        Optional extra data; converted with ``str`` unless it is ``None``.
    """
    payload_text = None if payload is None else str(payload)

    aborted_event = events.AbortedTraceEvent(
        instanceid=instance,
        why=_why_str(why),
        payload=payload_text,
    )
    trace.post(tm_env.app_events_dir, aborted_event)
Ejemplo n.º 7
0
 def _update_task(self, appname, server, why):
     """Creates/updates application task with the new placement."""
     # Servers in the cell have full control over task node.
     if self.app_events_dir:
         if server:
             trace.post(
                 self.app_events_dir,
                 app_events.ScheduledTraceEvent(instanceid=appname,
                                                where=server,
                                                why=why))
         else:
             trace.post(
                 self.app_events_dir,
                 app_events.PendingTraceEvent(instanceid=appname, why=why))
Ejemplo n.º 8
0
    def _finish(self):
        """Clean up the container and report how the app ended.

        Does nothing when no app state can be loaded from the service data
        directory. Otherwise the container (looked up by the app's unique
        name) is force-removed if it exists, an aborted, killed (OOM) or
        finished trace event is posted as appropriate, the credential spec
        is cleaned up on Windows, and local logs are archived.
        """
        app = runtime.load_app(self._service.data_dir, runtime.STATE_JSON)
        if not app:
            return

        client = self._get_client()
        container = None
        state = None
        name = appcfg.app_unique_name(app)

        try:
            container = client.containers.get(name)
            state = container.attrs.get('State')
        except docker.errors.NotFound:
            # No such container: nothing to remove and no state to report.
            pass

        if container is not None:
            try:
                container.remove(force=True)
            except docker.errors.APIError:
                _LOGGER.error('Failed to remove %s', container.id)

        aborted = _check_aborted(self._service.data_dir)
        if aborted is not None:
            # An abort record takes precedence over the container state.
            app_abort.report_aborted(
                self._tm_env,
                app.name,
                why=aborted.get('why'),
                payload=aborted.get('payload')
            )
        elif state is not None:
            if state.get('OOMKilled', False):
                exit_event = events.KilledTraceEvent(
                    instanceid=app.name,
                    is_oom=True,
                )
            else:
                exit_event = events.FinishedTraceEvent(
                    instanceid=app.name,
                    rc=state.get('ExitCode', 256),
                    signal=0,
                    payload=state,
                )

            trace.post(self._tm_env.app_events_dir, exit_event)

        if os.name == 'nt':
            credential_spec.cleanup(name, client)

        # Archiving is best effort: any failure is logged, not raised.
        try:
            runtime.archive_logs(
                self._tm_env, name, self._service.data_dir
            )
        except Exception:  # pylint: disable=W0703
            _LOGGER.exception('Unexpected exception storing local logs.')
Ejemplo n.º 9
0
    def process_blackedout_servers(self, servers):
        """Callback invoked when server blacklist is modified.

        Compares ``servers`` with the currently stored blacklist, logs each
        server that entered or left it, posts the matching blackout /
        blackout-cleared trace events (only when ``self.server_events_dir``
        is set) and then stores the new blacklist.

        :param servers:
            Iterable of server names that are now blacked out.
        """
        current = set(servers)
        previous = self.servers_blacklist
        pending = []

        for name in current - previous:
            _LOGGER.info('Server blackout: %s', name)
            pending.append(
                server_events.ServerBlackoutTraceEvent(servername=name)
            )

        for name in previous - current:
            _LOGGER.info('Server blackout cleared: %s', name)
            pending.append(
                server_events.ServerBlackoutClearedTraceEvent(
                    servername=name
                )
            )

        # Events are posted only after both differences were computed.
        if pending and self.server_events_dir:
            for blackout_event in pending:
                trace.post(self.server_events_dir, blackout_event)

        self.servers_blacklist = current
Ejemplo n.º 10
0
    def _record_server_state(self, servername):
        """Record server state.

        After delegating to the parent class, the server's state and the
        accompanying ``since`` value are written to its placement node, and
        a server-state trace event is posted when
        ``self.server_events_dir`` is set. A warning is logged and nothing
        is written when the server is unknown.

        :param servername:
            Name of the server whose state is recorded.
        """
        super(Master, self)._record_server_state(servername)

        server = self.servers.get(servername)
        if not server:
            _LOGGER.warning('Server not found: %s', servername)
            return

        node_path = z.path.placement(servername)
        state, since = server.get_state()
        node_data = {
            'state': state.value,
            'since': since
        }
        self.backend.put(node_path, node_data)

        if not self.server_events_dir:
            return

        state_event = server_events.ServerStateTraceEvent(
            servername=servername,
            state=state.value,
        )
        trace.post(self.server_events_dir, state_event)
Ejemplo n.º 11
0
def configure(tm_env, event, runtime, runtime_param=None):
    """Create the directory necessary for starting the application.

    This operation is idempotent (it can be repeated).

    The directory layout is::

        - (treadmill root)/
          - apps/
            - (app unique name)/
              - data/
                - app_start
                - app.json
                - manifest.yml
                env/
                - TREADMILL_*
                run
                finish
                log/
                - run

    The 'run' script is responsible for creating container environment
    and starting the container.

    The 'finish' script is invoked when container terminates and will
    deallocate any resources (NAT rules, etc) that were allocated for the
    container.

    :param tm_env:
        Environment object providing ``apps_dir``,
        ``running_tombstone_dir`` and ``app_events_dir``.
    :param event:
        Path of the event file; it is copied as ``manifest.yml``.
    :param runtime:
        Runtime identifier, forwarded to ``load_runtime_manifest``.
    :param runtime_param:
        Optional list of ``'param=xyz'`` strings passed to the runtime as a
        comma-joined ``--runtime-param`` argument.
        NOTE(review): only the Windows run script uses it - confirm that
        ignoring it elsewhere is intended.
    :type runtime_param:
        ``str list``
    :returns:
        The directory of the created service, or ``None`` when the event
        file is gone.
    """
    try:
        manifest_data = load_runtime_manifest(tm_env, event, runtime)
    except IOError:
        # The event file vanished; there is nothing to configure.
        _LOGGER.exception('No event to load: %r', event)
        return None

    # Freeze the manifest and derive the unique name for the app.
    app = utils.to_obj(manifest_data)
    uniq_name = appcfg.app_unique_name(app)

    # Build the command line of the container start script.
    treadmill_path = subproc.resolve('treadmill')
    if os.name == 'nt':
        if runtime_param:
            param_arg = '--runtime-param {}'.format(','.join(runtime_param))
        else:
            param_arg = ''
        run_script = (
            '{treadmill}/scripts/treadmill sproc run {param} .'.format(
                treadmill=treadmill_path,
                param=param_arg,
            ))
    else:
        run_script = 'exec {treadmill}/bin/treadmill sproc run ../'.format(
            treadmill=treadmill_path,
        )

    # Create the supervised service for the container.
    tombstone = {
        'uds': False,
        'path': tm_env.running_tombstone_dir,
        'id': app.name
    }
    policy = {
        'limit': 0,
        'interval': 60,
        'tombstone': tombstone
    }
    container_svc = supervisor.create_service(
        tm_env.apps_dir,
        name=uniq_name,
        app_run_script=run_script,
        userid='root',
        downed=False,
        monitor_policy=policy,
        environ={},
        environment=app.environment
    )
    data_dir = container_svc.data_dir

    # Keep a copy of the original event as 'manifest.yml'.
    try:
        shutil.copyfile(event, os.path.join(data_dir, 'manifest.yml'))
    except IOError as err:
        if err.errno != errno.ENOENT:
            raise
        # The event disappeared meanwhile: undo the service creation.
        shutil.rmtree(container_svc.directory)
        _LOGGER.exception('Event gone: %r', event)
        return None

    def _dump_manifest(out_file):
        """Write the JSON-encoded manifest to ``out_file``."""
        out_file.writelines(utils.json_genencode(manifest_data))

    # Store the app.json in the container directory.
    fs.write_safe(
        os.path.join(data_dir, appcfg.APP_JSON),
        _dump_manifest,
        mode='w',
        permission=0o644
    )

    configured_event = events.ConfiguredTraceEvent(
        instanceid=app.name,
        uniqueid=app.uniqueid,
    )
    trace.post(tm_env.app_events_dir, configured_event)

    return container_svc.directory
Ejemplo n.º 12
0
    def test_post(self):
        """Test trace.post.

        Each scenario posts an event, checks that the expected event file
        exists, feeds it to the publisher and verifies the resulting
        ``create`` call(s) on the mocked ZK client.

        NOTE(review): the expected names embed timestamp ``100`` and host
        ``baz`` - presumably time and hostname are patched outside this
        method; confirm.
        """
        # Disable W0212(protected-access)
        # pylint: disable=W0212
        zk_mock = mock.Mock()
        zk_mock.get_children.return_value = []
        publisher = events_publisher.EventsPublisher(
            zk_mock,
            app_events_dir=self.app_events_dir,
            server_events_dir=self.server_events_dir
        )
        # Keyword arguments shared by every expected ``create`` call.
        create_kwargs = {
            'ephemeral': False,
            'makepath': True,
            'sequence': False,
            'acl': mock.ANY,
        }

        def post_and_publish(events_dir, event, filename, publish):
            """Post ``event``, check its file exists, then publish it."""
            trace.post(events_dir, event)
            path = os.path.join(events_dir, filename)
            self.assertTrue(os.path.exists(path))
            publisher._on_created(path, publish)

        # Application pending event.
        post_and_publish(
            self.app_events_dir,
            app_events.PendingTraceEvent(
                instanceid='foo.bar#123',
                why='created',
            ),
            '100,foo.bar#123,pending,created',
            app_zk.publish
        )
        zk_mock.create.assert_called_once_with(
            '/trace/007B/foo.bar#123,100,baz,pending,created',
            b'',
            **create_kwargs
        )

        # Application pending-delete event.
        zk_mock.reset_mock()
        post_and_publish(
            self.app_events_dir,
            app_events.PendingDeleteTraceEvent(
                instanceid='foo.bar#123',
                why='deleted'
            ),
            '100,foo.bar#123,pending_delete,deleted',
            app_zk.publish
        )
        zk_mock.create.assert_called_once_with(
            '/trace/007B/foo.bar#123,100,baz,pending_delete,deleted',
            b'',
            **create_kwargs
        )

        # Application aborted event: also expects a finished node.
        zk_mock.reset_mock()
        post_and_publish(
            self.app_events_dir,
            app_events.AbortedTraceEvent(
                instanceid='foo.bar#123',
                why='test'
            ),
            '100,foo.bar#123,aborted,test',
            app_zk.publish
        )
        finished_data = json.dumps({
            'data': 'test',
            'host': 'baz',
            'state': 'aborted',
            'when': '100'
        }, sort_keys=True).encode()
        self.assertEqual(zk_mock.create.call_args_list, [
            mock.call(
                '/trace/007B/foo.bar#123,100,baz,aborted,test',
                b'',
                **create_kwargs
            ),
            mock.call(
                '/finished/foo.bar#123',
                finished_data,
                **create_kwargs
            )
        ])

        # Server state event.
        zk_mock.reset_mock()
        post_and_publish(
            self.server_events_dir,
            server_events.ServerStateTraceEvent(
                servername='test.xx.com',
                state='up'
            ),
            '100,test.xx.com,server_state,up',
            server_zk.publish
        )
        zk_mock.create.assert_called_once_with(
            '/server-trace/005D/test.xx.com,100,baz,server_state,up',
            b'',
            **create_kwargs
        )

        # Server blackout event.
        zk_mock.reset_mock()
        post_and_publish(
            self.server_events_dir,
            server_events.ServerBlackoutTraceEvent(
                servername='test.xx.com'
            ),
            '100,test.xx.com,server_blackout,',
            server_zk.publish
        )
        zk_mock.create.assert_called_once_with(
            '/server-trace/005D/test.xx.com,100,baz,server_blackout,',
            b'',
            **create_kwargs
        )
Ejemplo n.º 13
0
    def _run(self, manifest):
        """Create and start the docker container described by ``manifest``.

        Steps, in order: register ``zkutils.exit_on_lost`` as a listener on
        the ZK connection, reset the manifest's tcp/udp ephemeral port
        lists, create a ``container_data`` directory that is mapped
        read-only into the container, save the app, register identity and
        running presence, create and start the container, write the
        manifest (updated with network info) into ``container_data``,
        register endpoints, post a service-running trace event and then
        poll until the container is no longer running.

        :param manifest:
            Mutable manifest mapping; it is modified in place.
        :raises exc.ContainerSetupError:
            When docker reports that the image was not found.
        """
        context.GLOBAL.zk.conn.add_listener(zkutils.exit_on_lost)

        with lc.LogContext(_LOGGER, self._service.name,
                           lc.ContainerAdapter) as log:
            log.info('Running %r', self._service.directory)

            for proto in ('tcp', 'udp'):
                manifest['ephemeral_ports'][proto] = []

            # Directory used to hand data over to the container.
            container_data_dir = os.path.join(
                self._service.data_dir, 'container_data'
            )
            log.info('container_data %r', container_data_dir)
            fs.mkdir_safe(container_data_dir)

            # Volume mapping config: the directory is mounted read-only.
            bind_spec = {
                'bind': 'c:\\container_data',
                'mode': 'ro'
            }
            volume_mapping = {container_data_dir: bind_spec}

            app = runtime.save_app(manifest, self._service.data_dir)

            app_presence = presence.EndpointPresence(
                context.GLOBAL.zk.conn, manifest
            )
            app_presence.register_identity()
            app_presence.register_running()

            client = self._get_client()

            try:
                container = _create_container(
                    self._tm_env, self._get_config(), client, app,
                    volume_mapping
                )
            except docker.errors.ImageNotFound:
                message = 'Image {0} was not found'.format(app.image)
                raise exc.ContainerSetupError(
                    message, app_abort.AbortedReason.IMAGE
                )

            container.start()
            container.reload()

            _update_network_info_in_manifest(container, manifest)
            runtime.save_app(
                manifest, container_data_dir, app_json='app.json'
            )

            _LOGGER.info('Container is running.')
            app_presence.register_endpoints()

            running_event = events.ServiceRunningTraceEvent(
                instanceid=app.name,
                uniqueid=app.uniqueid,
                service='docker',
            )
            trace.post(self._tm_env.app_events_dir, running_event)

            # Poll in 10 second slices, refreshing the status each time.
            # NOTE(review): the docker SDK documents that ``wait`` may
            # raise when the timeout is exceeded - confirm that a
            # long-running container does not abort this loop.
            while container.status == 'running':
                container.wait(timeout=10)
                container.reload()