Ejemplo n.º 1
0
def main():
    """Run one long-running echo task synchronously, with output logging.

    Builds a mesos_task executor from CLI args, wraps it in the logging
    executor, and blocks until the task finishes.
    """
    args = parse_args()

    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')

    mesos_config = {
        'secret': args.secret,
        'mesos_address': args.master,
        'pool': args.pool,
        'role': args.role,
    }
    mesos_executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config=mesos_config,
    )

    # Wrap the Mesos executor so task output is logged.
    executor = processor.executor_from_config(
        provider='logging',
        provider_config={'downstream_executor': mesos_executor},
    )

    runner = Sync(executor=executor)
    TaskConfig = mesos_executor.TASK_CONFIG_INTERFACE
    task_config = TaskConfig(
        image="ubuntu:14.04",
        cmd="bash -c 'for i in $(seq 1 5); do echo $i&&sleep 10; done'")
    result = runner.run(task_config)
    print(result)

    runner.stop()
Ejemplo n.º 2
0
def main():
    """Run a trivial task with events persisted to a local file.

    Loads the mesos and stateful plugins, wraps the Mesos executor with a
    FilePersistence-backed stateful executor, runs one task synchronously,
    and prints its persisted status.
    """
    mesos_address = os.getenv('MESOS', 'mesosmaster:5050')
    with open('./examples/cluster/secret') as f:
        secret = f.read().strip()

    processor = TaskProcessor()
    for p in ['mesos', 'stateful']:
        processor.load_plugin(provider_module='task_processing.plugins.' + p)
    mesos_executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': secret,
            'mesos_address': mesos_address,
            'role': 'taskproc',
        })
    # Persist task events to /tmp/foo via the stateful wrapper.
    executor = processor.executor_from_config(
        provider='stateful',
        provider_config={
            'downstream_executor': mesos_executor,
            'persister': FilePersistence(output_file='/tmp/foo')
        })

    runner = Sync(executor=executor)
    tasks = set()
    TaskConfig = mesos_executor.TASK_CONFIG_INTERFACE
    for _ in range(1, 2):
        task_config = TaskConfig(image='busybox', cmd='/bin/true')
        tasks.add(task_config.task_id)
        runner.run(task_config)
        print(executor.status(task_config.task_id))

    # Fix: stop the runner so the framework unregisters from Mesos.
    # Previously the runner was never stopped, leaking the framework
    # registration (every sibling example calls runner.stop()).
    runner.stop()
Ejemplo n.º 3
0
def main():
    """Launch two trivial tasks asynchronously and poll until both terminate.

    Returns 0 when all tasks reached a terminal state within the polling
    window, 1 otherwise.
    """
    args = parse_args()

    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': args.secret,
            'mesos_address': args.master,
            'role': args.role,
        },
    )

    # Count terminal events as they arrive.
    counter = Counter()
    handlers = [
        EventHandler(predicate=lambda x: x.terminal, cb=counter.process_event),
    ]
    runner = Async(executor, handlers)

    TaskConfig = executor.TASK_CONFIG_INTERFACE
    tasks_to_launch = 2
    for _ in range(tasks_to_launch):
        runner.run(TaskConfig(image='busybox', cmd='/bin/true'))

    # Poll up to five times, two seconds apart.
    for _ in range(5):
        print('terminated {} tasks'.format(counter.terminated))
        if counter.terminated >= tasks_to_launch:
            break
        time.sleep(2)

    runner.stop()
    return 0 if counter.terminated >= tasks_to_launch else 1
Ejemplo n.º 4
0
def main():
    """Run one task via the Promise runner and block on its future.

    Returns 0 when the task succeeded, 1 otherwise.
    """
    args = parse_args()
    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')

    provider_config = {
        'secret': args.secret,
        'mesos_address': args.master,
        'pool': args.pool,
        'role': args.role,
    }
    executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config=provider_config,
    )

    TaskConfig = executor.TASK_CONFIG_INTERFACE
    task_config = TaskConfig(image="busybox", cmd='/bin/true')
    # This only works on agents that have added mesos as a containerizer
    # task_config = TaskConfig(containerizer='MESOS', cmd='/bin/true')

    with ThreadPoolExecutor(max_workers=2) as futures_executor:
        runner = Promise(executor, futures_executor)
        future = runner.run(task_config)
        # Block until the future resolves, then fetch the terminal event.
        wait([future])
        result = future.result()
        print(result)
        print(result.raw)
        runner.stop()

    return 0 if result.success else 1
Ejemplo n.º 5
0
def main():
    """Run a 30s sleep under the timeout executor so it is killed at 10s."""
    args = parse_args()

    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')

    mesos_config = {
        'secret': args.secret,
        'mesos_address': args.master,
        'pool': args.pool,
        'role': args.role,
    }
    mesos_executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config=mesos_config,
    )

    # Wrap the Mesos executor so tasks are killed once their timeout expires.
    executor = processor.executor_from_config(
        provider='timeout',
        provider_config={'downstream_executor': mesos_executor},
    )

    runner = Sync(executor=executor)
    TaskConfig = mesos_executor.TASK_CONFIG_INTERFACE
    task_config = TaskConfig(image='docker-dev.yelpcorp.com/dumb-busybox',
                             cmd='exec dumb-init /bin/sleep 30',
                             timeout=10)
    result = runner.run(task_config)
    print(result)

    runner.stop()
def test_load_plugin():
    """Loading a plugin registers its module and executors; reloading raises."""
    processor = TaskProcessor()
    processor.load_plugin('tests.mock_plugin')

    assert 'mock_plugin' in processor.registry.plugin_modules
    for provider in ('dummy', 'dummy2'):
        assert provider in processor.registry.task_executors

    # A plugin may only be loaded once.
    with pytest.raises(ValueError):
        processor.load_plugin('tests.mock_plugin')
def test_executor_from_config():
    """Executors are built from provider config; unknown providers raise."""
    processor = TaskProcessor()
    processor.load_plugin('tests.mock_plugin')

    executor = processor.executor_from_config(
        provider='dummy',
        provider_config={'arg': 'foobar'},
    )
    assert executor.arg == 'foobar'
    executor.run(None)
    executor.kill(None)

    # Unknown provider names are rejected.
    with pytest.raises(ValueError):
        processor.executor_from_config('lol')
Ejemplo n.º 8
0
def main():
    """Run a task with its events persisted to DynamoDB.

    Creates a boto session against a local DynamoDB endpoint, ensures the
    events table exists, wraps the Mesos executor with a stateful executor
    backed by DynamoDBPersister, and runs one task synchronously.
    """
    mesos_address = os.getenv('MESOS', 'mesosmaster:5050')
    with open('./examples/cluster/secret') as f:
        secret = f.read().strip()

    processor = TaskProcessor()
    for p in ['mesos', 'stateful']:
        processor.load_plugin(provider_module='task_processing.plugins.' + p)
    mesos_executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': secret,
            'mesos_address': mesos_address,
            'role': 'taskproc',
        })

    s = session.Session(region_name='foo',
                        aws_access_key_id='foo',
                        aws_secret_access_key='bar')
    dynamo_address = os.getenv('DYNAMO', 'http://dynamodb:5050')
    client = s.client(
        service_name='dynamodb',
        endpoint_url=dynamo_address,
    )
    try:
        create_table(client)
    except ClientError:
        # Table presumably already exists; fine for this example.
        pass

    executor = processor.executor_from_config(
        provider='stateful',
        provider_config={
            'downstream_executor': mesos_executor,
            'persister': DynamoDBPersister(table_name='events',
                                           endpoint_url=dynamo_address,
                                           session=s)
        })
    runner = Sync(executor=executor)
    tasks = set()
    TaskConfig = mesos_executor.TASK_CONFIG_INTERFACE
    for _ in range(1, 2):
        task_config = TaskConfig(image='ubuntu:14.04', cmd='/bin/sleep 2')
        tasks.add(task_config.task_id)
        runner.run(task_config)
        print(executor.status(task_config.task_id))

    # Fix: stop the runner so the framework unregisters from Mesos.
    # Previously the runner was never stopped, leaking the framework
    # registration (every sibling example calls runner.stop()).
    runner.stop()
Ejemplo n.º 9
0
def main():
    """Run 'hello world' on Mesos synchronously and report the result.

    Returns 0 when the task succeeded, 1 otherwise.
    """
    # Address of the Mesos master, overridable via the MESOS env var.
    mesos_address = os.getenv('MESOS', 'mesosmaster:5050')

    # Shared secret used to authenticate the taskproc scheduler with Mesos.
    with open('./examples/cluster/secret') as f:
        secret = f.read().strip()

    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')

    # Build a taskproc executor (NOT a Mesos executor) from this config;
    # other Mesos properties, such as the role, can be set here as well.
    provider_config = {
        'secret': secret,
        'mesos_address': mesos_address,
        'role': 'taskproc',
    }
    executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config=provider_config,
    )

    # Sync runner: run() blocks until the task completes.
    runner = Sync(executor)

    # Describe the task: the busybox Docker image echoing "hello world".
    TaskConfig = executor.TASK_CONFIG_INTERFACE
    task_config = TaskConfig(image="busybox", cmd='echo "hello world"')

    result = runner.run(task_config)
    print(result)

    # Stop the taskproc framework and unregister it from Mesos.
    runner.stop()

    return 0 if result.success else 1
Ejemplo n.º 10
0
def main():
    """Launch tasks and consume their events from a queue via Subscription.

    Returns 0 when all launched tasks reached a terminal state, 1 on
    timeout waiting for events.
    """
    mesos_address = os.environ['MESOS']
    with open('./examples/cluster/secret') as f:
        secret = f.read().strip()

    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': secret,
            'mesos_address': mesos_address,
            'role': 'taskproc',
        },
    )

    # Subscription runner pushes every task event onto this queue.
    queue = Queue(100)
    runner = Subscription(executor, queue)

    TaskConfig = executor.TASK_CONFIG_INTERFACE
    tasks = set()
    for _ in range(2):
        task_config = TaskConfig(image='busybox', cmd='/bin/true')
        tasks.add(task_config.task_id)
        runner.run(task_config)

    print('Running {} tasks: {}'.format(len(tasks), tasks))
    while tasks:
        # Wait up to 10s per event; give up if nothing arrives.
        try:
            event = queue.get(block=True, timeout=10)
        except Empty:
            print('Timeout while waiting for {}'.format(tasks))
            break
        if event.terminal:
            tasks.discard(event.task_id)

    runner.stop()
    return 0 if len(tasks) == 0 else 1
Ejemplo n.º 11
0
def main():
    """Request impossible resources with an offer timeout and wait for the
    task to terminate (it should never be scheduled).
    """
    counter = Counter()
    args = parse_args()

    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    mesos_executor = processor.executor_from_config(
        provider='mesos_task',
        provider_config={
            'secret': args.secret,
            'mesos_address': args.master,
            'pool': args.pool,
            'role': args.role,
        },
    )

    TaskConfig = mesos_executor.TASK_CONFIG_INTERFACE
    # Count terminal events as they arrive.
    runner = Async(
        mesos_executor,
        [EventHandler(predicate=lambda x: x.terminal, cb=counter.process_event)],
    )

    # Deliberately over-provisioned so no offer can satisfy it before the
    # 5-second offer timeout fires.
    timeout_task_config = TaskConfig(
        image='busybox',
        cmd='exec /bin/sleep 100',
        offer_timeout=5.0,
        cpus=20,
        mem=2048,
        disk=2000,
    )
    runner.run(timeout_task_config)

    # Poll for up to ~100 seconds for a terminal event.
    for _ in range(50):
        if counter.terminated >= 1:
            break
        print("waiting for task %s to finish" % (timeout_task_config.task_id))
        time.sleep(2)

    runner.stop()
    return 0
Ejemplo n.º 12
0
def remote_run_start(args):
    """Schedule an adhoc task on Mesos and block until it reaches a
    terminal state.

    Exits the process: 0 on task success, 1 on failure or configuration
    error, 143 when interrupted by SIGTERM. Returns early (in the parent
    processes) when --detach daemonizes.
    """
    system_paasta_config, service, cluster, \
        soa_dir, instance, instance_type = extract_args(args)
    overrides_dict = {}

    constraints_json = args.constraints_json
    if constraints_json:
        # Bug fix: bind `constraints` before the try; a JSON parse failure
        # previously left it unbound and the `if constraints:` check below
        # raised UnboundLocalError.
        constraints = None
        try:
            constraints = json.loads(constraints_json)
        except Exception as e:
            # Bug fix: paasta_print does not support printf-style lazy
            # arguments; format the message eagerly.
            paasta_print("Error while parsing constraints: %s" % e)

        if constraints:
            overrides_dict['constraints'] = constraints

    if args.cmd:
        overrides_dict['cmd'] = args.cmd

    if args.instances:
        overrides_dict['instances'] = args.instances

    run_id = args.run_id
    if run_id is None:
        # Generate a random 8-character alphanumeric run id.
        run_id = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(8))
        paasta_print("Assigned random run-id: %s" % run_id)

    if args.detach:
        # Standard double-fork daemonization; the parents return, the
        # surviving child silences stdout/stderr.
        paasta_print("Running in background")
        if os.fork() > 0:
            return
        os.setsid()
        if os.fork() > 0:
            return
        sys.stdout = open('/dev/null', 'w')
        sys.stderr = open('/dev/null', 'w')

    paasta_print('Scheduling a task on Mesos')

    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    processor.load_plugin(provider_module='task_processing.plugins.stateful')

    MesosExecutor = processor.executor_cls(provider='mesos')

    native_job_config = load_paasta_native_job_config(
        service,
        instance,
        cluster,
        soa_dir=soa_dir,
        instance_type=instance_type,
        config_overrides=overrides_dict,
        load_deployments=not args.docker_image,
    )
    try:
        task_config = MesosExecutor.TASK_CONFIG_INTERFACE(
            **paasta_to_task_config_kwargs(
                service=service,
                instance=instance,
                system_paasta_config=system_paasta_config,
                native_job_config=native_job_config,
                config_overrides=overrides_dict,
                docker_image=args.docker_image,
                offer_timeout=args.staging_timeout,
            ))
    except InvariantException as e:
        # Report exactly which part of the config was rejected.
        if len(e.missing_fields) > 0:
            paasta_print(
                PaastaColors.red(
                    "Mesos task config is missing following fields: {}".format(
                        ', '.join(e.missing_fields))))
        elif len(e.invariant_errors) > 0:
            paasta_print(
                PaastaColors.red(
                    "Mesos task config is failing following checks: {}".format(
                        ', '.join(str(ie) for ie in e.invariant_errors))))
        else:
            paasta_print(PaastaColors.red(f"Mesos task config error: {e}"))
        traceback.print_exc()
        emit_counter_metric('paasta.remote_run.start.failed', service,
                            instance)
        sys.exit(1)
    except PTypeError as e:
        paasta_print(
            PaastaColors.red(
                f"Mesos task config is failing a type check: {e}"))
        traceback.print_exc()
        emit_counter_metric('paasta.remote_run.start.failed', service,
                            instance)
        sys.exit(1)

    # Bug fix: define `runner` before installing the signal handlers. A
    # signal arriving before the executor stack was built previously raised
    # NameError inside handle_interrupt.
    runner = None

    def handle_interrupt(_signum, _frame):
        paasta_print(
            PaastaColors.red("Signal received, shutting down scheduler."))
        if runner is not None:
            runner.stop()
        if _signum == signal.SIGTERM:
            sys.exit(143)
        else:
            sys.exit(1)

    signal.signal(signal.SIGINT, handle_interrupt)
    signal.signal(signal.SIGTERM, handle_interrupt)

    default_role = system_paasta_config.get_remote_run_config().get(
        'default_role')
    assert default_role

    try:
        executor_stack = build_executor_stack(
            processor=processor,
            service=service,
            instance=instance,
            role=native_job_config.get_role() or default_role,
            pool=native_job_config.get_pool(),
            cluster=cluster,
            run_id=run_id,
            system_paasta_config=system_paasta_config,
            framework_staging_timeout=args.staging_timeout,
        )
        runner = Sync(executor_stack)

        terminal_event = runner.run(task_config)
        runner.stop()
    except Exception as e:
        # Fix: `(Exception, ValueError)` was redundant (ValueError is an
        # Exception); also eager %-formatting and message typo corrected.
        paasta_print("Exception while running executor stack: %s" % e)
        traceback.print_exc()
        emit_counter_metric('paasta.remote_run.start.failed', service,
                            instance)
        sys.exit(1)

    if terminal_event.success:
        paasta_print("Task finished successfully")
        sys.exit(0)
    else:
        paasta_print(PaastaColors.red(f"Task failed: {terminal_event.raw}"))
        # This is not necessarily an infrastructure failure. It may just be
        # an application failure.
        emit_counter_metric('paasta.remote_run.start.failed', service,
                            instance)
        sys.exit(1)
Ejemplo n.º 13
0
def test_import():
    """Smoke test: the task_processor module imports and loads the mesos plugin."""
    from task_processing.task_processor import TaskProcessor

    processor = TaskProcessor()
    processor.load_plugin('task_processing.plugins.mesos')
Ejemplo n.º 14
0
def remote_run_start(args):
    """Schedule an adhoc task on Mesos and block until it finishes.

    Exits the process: 0 on task success, 1 on failure, 143 on SIGTERM.
    Returns early (in the parent processes) when --detach daemonizes.
    """
    system_paasta_config, service, cluster, soa_dir, instance, instance_type = extract_args(args)
    overrides_dict = {}

    constraints_json = args.constraints_json
    if constraints_json:
        # Bug fix: bind `constraints` before the try; a JSON parse failure
        # previously left it unbound and the `if constraints:` check below
        # raised UnboundLocalError.
        constraints = None
        try:
            constraints = json.loads(constraints_json)
        except Exception as e:
            # Bug fix: paasta_print does not support printf-style lazy
            # arguments; format the message eagerly.
            paasta_print("Error while parsing constraints: %s" % e)

        if constraints:
            overrides_dict['constraints'] = constraints

    if args.cmd:
        overrides_dict['cmd'] = args.cmd

    if args.instances:
        overrides_dict['instances'] = args.instances

    run_id = args.run_id
    if run_id is None:
        # Generate a random 8-character alphanumeric run id.
        run_id = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(8)
        )
        paasta_print("Assigned random run-id: %s" % run_id)

    if args.detach:
        # Standard double-fork daemonization; the parents return, the
        # surviving child silences stdout/stderr.
        paasta_print("Running in background")
        if os.fork() > 0:
            return
        os.setsid()
        if os.fork() > 0:
            return
        sys.stdout = open('/dev/null', 'w')
        sys.stderr = open('/dev/null', 'w')

    paasta_print('Scheduling a task on Mesos')

    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.mesos')
    processor.load_plugin(provider_module='task_processing.plugins.stateful')

    MesosExecutor = processor.executor_cls(provider='mesos')
    task_config = MesosExecutor.TASK_CONFIG_INTERFACE(
        **paasta_to_task_config_kwargs(
            service,
            instance,
            cluster,
            system_paasta_config,
            instance_type,
            soa_dir=soa_dir,
            config_overrides=overrides_dict,
        ),
    )

    executor_stack = build_executor_stack(
        processor,
        service,
        instance,
        cluster,
        run_id,
        system_paasta_config,
        args.staging_timeout,
    )
    runner = Sync(executor_stack)

    def handle_interrupt(_signum, _frame):
        # Shut the scheduler down cleanly on SIGINT/SIGTERM.
        paasta_print(
            PaastaColors.red("Signal received, shutting down scheduler."),
        )
        runner.stop()
        if _signum == signal.SIGTERM:
            sys.exit(143)
        else:
            sys.exit(1)
    signal.signal(signal.SIGINT, handle_interrupt)
    signal.signal(signal.SIGTERM, handle_interrupt)

    terminal_event = runner.run(task_config)
    runner.stop()
    if terminal_event.success:
        paasta_print("Task finished successfully")
        sys.exit(0)
    else:
        paasta_print(
            PaastaColors.red("Task failed: {}".format(terminal_event.raw)),
        )
        sys.exit(1)
Ejemplo n.º 15
0
def remote_run_start(args):
    """ Start a task in Mesos
    Steps:
    1. Accumulate overrides
    2. Create task configuration
    3. Build executor stack
    4. Run the task on the executor stack

    Exits via sys.exit() with the code from handle_terminal_event; returns
    early when --dry-run prints the would-be config, or in the parent
    processes when --detach daemonizes.
    """
    # accumulate all configuration needed to build what we need to run a task
    system_paasta_config, service, cluster, \
        soa_dir, instance, instance_type = extract_args(args)
    # TODO: move run_id into task identifier?
    run_id = args.run_id or generate_run_id(length=10)
    framework_name = create_framework_name(service, instance, run_id)
    overrides = accumulate_config_overrides(args, service, instance)
    # TODO: implement DryRunExecutor?
    taskproc_config = system_paasta_config.get_taskproc()
    native_job_config = load_paasta_native_job_config(
        service,
        instance,
        cluster,
        soa_dir=soa_dir,
        instance_type=instance_type,
        config_overrides=overrides,
        load_deployments=not args.docker_image,
    )
    region = args.aws_region or taskproc_config.get('aws_region')
    # A default role must be configured; the job-level role wins when set.
    default_role = system_paasta_config.get_remote_run_config().get(
        'default_role')
    assert default_role
    role = native_job_config.get_role() or default_role
    pool = native_job_config.get_pool()
    processor = TaskProcessor()
    processor.load_plugin(provider_module='task_processing.plugins.stateful')
    processor.load_plugin(provider_module='task_processing.plugins.mesos')

    if args.detach:
        # Double fork to daemonize; the parents return immediately and the
        # surviving child redirects stdout/stderr to /dev/null.
        paasta_print("Running in background")
        if os.fork() > 0:
            return
        os.setsid()
        if os.fork() > 0:
            return
        sys.stdout = open('/dev/null', 'w')
        sys.stderr = open('/dev/null', 'w')

    # create factory functions for task_config and executors, which makes it
    # easier to recreate them for retry purposes
    def task_config_factory():
        # One fresh task config per retry attempt.
        return create_mesos_task_config(
            processor=processor,
            service=service,
            instance=instance,
            system_paasta_config=system_paasta_config,
            native_job_config=native_job_config,
            offer_timeout=args.staging_timeout,
            docker_image=args.docker_image,
        )

    framework_config = dict(
        cluster=cluster,
        framework_name=framework_name,
        framework_staging_timeout=args.staging_timeout,
        role=role,
        pool=pool,
    )
    executor_kwargs = dict(  # used to create mesos executor
        processor=processor,
        system_paasta_config=system_paasta_config,
        taskproc_config=taskproc_config,
        **framework_config,
    )

    def executor_factory():
        # One fresh Mesos executor per retry attempt, wrapped in the stack.
        mesos_executor = create_mesos_executor(**executor_kwargs)
        return build_executor_stack(
            processor,
            mesos_executor,
            taskproc_config,
            cluster,
            region,
        )

    if args.dry_run:
        # Print the framework/task config that would run, without
        # contacting Mesos, then bail out.
        task_config_dict = task_config_to_dict(task_config_factory())
        pp = pprint.PrettyPrinter(indent=2)
        paasta_print(
            PaastaColors.green("Would have run task with:"),
            PaastaColors.green("Framework config:"),
            pp.pformat(framework_config),
            PaastaColors.green("Task config:"),
            pp.pformat(task_config_dict),
            sep='\n',
        )
        return

    terminals = run_tasks_with_retries(
        executor_factory,
        task_config_factory,
        retries=args.retries,
    )
    # Only the final attempt determines the exit code / notification.
    final_event, final_task_config = terminals[-1]
    exit_code = handle_terminal_event(
        event=final_event,
        service=service,
        instance=instance,
        run_id=run_id,
        email_address=args.notification_email,
        framework_config=framework_config,
        task_config=final_task_config,
    )
    sys.exit(exit_code)
Ejemplo n.º 16
0
class MesosCluster:
    """A connection to one Mesos cluster and the tasks submitted to it.

    A task_processing Subscription runner delivers task/control events onto
    a twisted PyDeferredQueue; handle_next_event keeps a callback chain
    armed so every event is routed through _process_event to the owning
    task object.
    """

    def __init__(
        self,
        mesos_address,
        mesos_master_port=None,
        secret=None,
        principal=None,
        mesos_role=None,
        framework_id=None,
        enabled=True,
        default_volumes=None,
        dockercfg_location=None,
        offer_timeout=None,
    ):
        """Store cluster settings, load the mesos plugin, and connect.

        Args:
            mesos_address: address of the Mesos master.
            mesos_master_port: optional port, passed to get_mesos_leader.
            secret: framework authentication secret.
            principal: framework authentication principal.
            mesos_role: Mesos role the framework registers under.
            framework_id: existing framework id to fail over to, if any.
            enabled: when False, no framework is created and submits fail.
            default_volumes: volumes added to every task created here.
            dockercfg_location: URI added to each task's uris when set.
            offer_timeout: per-task offer timeout.
        """
        self.mesos_address = mesos_address
        self.mesos_master_port = mesos_master_port
        self.secret = secret
        self.principal = principal
        self.mesos_role = mesos_role
        self.enabled = enabled
        self.default_volumes = default_volumes or []
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout
        self.framework_id = framework_id

        self.processor = TaskProcessor()
        self.queue = PyDeferredQueue()
        self.deferred = None
        self.runner = None
        self.tasks = {}  # mesos task id -> task object, for event dispatch

        self.processor.load_plugin(
            provider_module='task_processing.plugins.mesos')
        self.connect()

    def set_enabled(self, is_enabled):
        """Enable (connect) or disable (stop, failing any live tasks)."""
        self.enabled = is_enabled
        if is_enabled:
            self.connect()
        else:
            self.stop(fail_tasks=True)

    def configure_tasks(
        self,
        default_volumes,
        dockercfg_location,
        offer_timeout,
    ):
        """Update the per-task defaults used by create_task."""
        self.default_volumes = default_volumes
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout

    def connect(self):
        """Create the framework runner and start draining its event queue."""
        self.runner = self.get_runner(self.mesos_address, self.queue)
        self.handle_next_event()

    def handle_next_event(self, deferred_result=None):
        """Arm callbacks for the next event on the queue.

        After each event is processed this method re-arms itself, so the
        queue is drained continuously; errors are logged via logError and
        processing continues with the next event.
        """
        if self.deferred and not self.deferred.called:
            log.warning(
                'Already have handlers waiting for next event in queue, '
                'not adding more')
            return
        self.deferred = self.queue.get()
        self.deferred.addCallback(self._process_event)
        self.deferred.addCallback(self.handle_next_event)
        self.deferred.addErrback(logError)
        self.deferred.addErrback(self.handle_next_event)

    def _check_connection(self):
        """Ensure the framework is running and event callbacks are armed."""
        if self.runner.stopping:
            # Last framework was terminated for some reason, re-connect.
            log.info('Last framework stopped, re-connecting')
            self.connect()
        elif self.deferred.called:
            # Just in case callbacks are missing, re-add.
            self.handle_next_event()

    def submit(self, task):
        """Submit a task to the cluster.

        Fails the task immediately (exit code 1) when Mesos is disabled.
        When the task environment carries CLUSTERMAN_RESOURCES and the
        clusterman metrics client is importable, resource metrics are
        emitted before launching the task.
        """
        if not task:
            return

        if not self.enabled:
            task.log.info('Task failed to start, Mesos is disabled.')
            task.exited(1)
            return
        self._check_connection()

        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        env = task.get_config()['environment']
        clusterman_resource_str = env.get('CLUSTERMAN_RESOURCES')
        clusterman_metrics = get_clusterman_metrics()
        if clusterman_resource_str and clusterman_metrics:
            clusterman_resources = json.loads(clusterman_resource_str)
            cluster = env.get('EXECUTOR_CLUSTER', env.get('PAASTA_CLUSTER'))
            pool = env.get('EXECUTOR_POOL', env.get('PAASTA_POOL'))
            aws_region = staticconf.read(f'clusters.{cluster}.aws_region',
                                         namespace='clusterman')
            metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(
                region_name=aws_region,
                app_identifier=pool,
            )
            with metrics_client.get_writer(
                    clusterman_metrics.APP_METRICS,
                    aggregate_meteorite_dims=True) as writer:
                for metric_key, metric_value in clusterman_resources.items():
                    writer.send((metric_key, int(time.time()), metric_value))
        self.runner.run(task.get_config())
        log.info(
            'Submitting task {} to {}'.format(
                mesos_task_id,
                self.mesos_address,
            ), )
        task.report_resources()

    def recover(self, task):
        """Reconcile a task's state with Mesos after a scheduler restart."""
        if not task:
            return

        if not self.enabled:
            task.log.info('Could not recover task, Mesos is disabled.')
            task.exited(None)
            return
        self._check_connection()

        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        task.log.info(
            'TRON RESTARTED! Starting recovery procedure by reconciling state for this task from Mesos'
        )
        task.started()
        self.runner.reconcile(task.get_config())
        task.report_resources()

    def create_task(
        self,
        action_run_id,
        command,
        cpus,
        mem,
        disk,
        constraints,
        docker_image,
        docker_parameters,
        env,
        extra_volumes,
        serializer,
        task_id=None,
    ):
        """Build a MesosTask for the given action run.

        Returns None when there is no runner, or when an explicit task_id
        is provided but rejected by the task config.
        """
        if not self.runner:
            return None

        uris = [self.dockercfg_location] if self.dockercfg_location else []
        volumes = combine_volumes(self.default_volumes, extra_volumes)
        task_kwargs = {
            'name': action_run_id,
            'cmd': command,
            'cpus': cpus,
            'mem': mem,
            'disk': disk,
            'constraints': constraints,
            'image': docker_image,
            'docker_parameters': docker_parameters,
            'environment': env,
            'volumes': volumes,
            'uris': uris,
            'offer_timeout': self.offer_timeout,
        }
        task_config = self.runner.TASK_CONFIG_INTERFACE(**task_kwargs)

        if task_id is not None:
            try:
                task_config = task_config.set_task_id(task_id)
            except ValueError:
                log.error(f'Invalid {task_id} for {action_run_id}')
                return

        return MesosTask(action_run_id, task_config, serializer)

    def get_runner(self, mesos_address, queue):
        """Create a Subscription runner that feeds `queue`.

        Returns None when disabled, the existing runner when one is still
        running, otherwise a new runner whose executor routes task output
        through per-task loggers.
        """
        if not self.enabled:
            log.info('Mesos is disabled, not creating a framework.')
            return None

        if self.runner and not self.runner.stopping:
            log.info('Already have a running framework, not creating one.')
            return self.runner

        framework_name = 'tron-{}'.format(socket.gethostname())
        executor = self.processor.executor_from_config(
            provider='mesos_task',
            provider_config={
                'secret':
                self.secret,
                'principal':
                self.principal,
                'mesos_address':
                get_mesos_leader(mesos_address, self.mesos_master_port),
                'role':
                self.mesos_role,
                'framework_name':
                framework_name,
                'framework_id':
                self.framework_id,
                'failover':
                True,
            })

        def log_output(task_id, message, stream):
            # Route each task's output lines to a per-task, per-stream logger.
            logger = logging.getLogger('{}.{}.{}'.format(
                TASK_OUTPUT_LOGGER,
                task_id,
                stream,
            ))
            logger.info(message)

        logging_executor = self.processor.executor_from_config(
            provider='logging',
            provider_config={
                'downstream_executor': executor,
                'handler': log_output,
                'format_string': '{line}',
            },
        )
        return Subscription(logging_executor, queue)

    def _process_event(self, event):
        """Dispatch one event.

        Control events act on the framework itself (stop/registered/
        unknown); task events are forwarded to the owning task object.
        """
        if event.kind == 'control':
            message = getattr(event, 'message', None)
            if message == 'stop':
                # Framework has been removed, stop it.
                log.warning('Framework has been stopped: {}'.format(event.raw))
                self.stop()
                MesosClusterRepository.remove(self.mesos_address)
            elif message == 'unknown':
                log.warning('Unknown error from Mesos master: {}'.format(
                    event.raw))
            elif message == 'registered':
                framework_id = event.raw['framework_id']['value']
                MesosClusterRepository.save(self.mesos_address, framework_id)
            else:
                log.warning('Unknown type of control event: {}'.format(event))

        elif event.kind == 'task':
            if not hasattr(event, 'task_id'):
                log.warning('Task event missing task_id: {}'.format(event))
                return
            if event.task_id not in self.tasks:
                log.warning(
                    'Received event for unknown task {}: {}'.format(
                        event.task_id,
                        event,
                    ), )
                return
            task = self.tasks[event.task_id]
            task.handle_event(event)
            if task.is_done:
                # Terminal tasks no longer need event dispatch.
                del self.tasks[event.task_id]
        else:
            log.warning('Unknown type of event: {}'.format(event))

    def stop(self, fail_tasks=False):
        """Stop the framework and reset the event queue.

        When fail_tasks is True, every tracked task is marked exited(None)
        and forgotten.
        """
        self.framework_id = None
        if self.runner:
            self.runner.stop()

        # Clear message queue
        if self.deferred:
            self.deferred.cancel()
            self.deferred = None
        self.queue = PyDeferredQueue()

        if fail_tasks:
            for key, task in list(self.tasks.items()):
                task.exited(None)
                del self.tasks[key]

    def kill(self, task_id):
        """Ask the runner to kill the given Mesos task."""
        return self.runner.kill(task_id)
Ejemplo n.º 17
0
class MesosCluster:
    def __init__(
        self,
        mesos_address,
        mesos_master_port=None,
        secret=None,
        principal=None,
        mesos_role=None,
        framework_id=None,
        enabled=True,
        default_volumes=None,
        dockercfg_location=None,
        offer_timeout=None,
    ):
        """Store cluster settings, load the Mesos plugin and connect."""
        # Connection / authentication settings.
        self.mesos_address = mesos_address
        self.mesos_master_port = mesos_master_port
        self.secret = secret
        self.principal = principal
        self.mesos_role = mesos_role
        self.framework_id = framework_id
        self.enabled = enabled

        # Per-task defaults; adjustable later via configure_tasks().
        self.default_volumes = default_volumes if default_volumes else []
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout

        # Event-processing state.
        self.processor = TaskProcessor()
        self.queue = PyDeferredQueue()
        self.deferred = None
        self.runner = None
        self.tasks = {}

        self.processor.load_plugin(
            provider_module='task_processing.plugins.mesos'
        )
        self.connect()

    def set_enabled(self, is_enabled):
        self.enabled = is_enabled
        if is_enabled:
            self.connect()
        else:
            self.stop(fail_tasks=True)

    def configure_tasks(
        self,
        default_volumes,
        dockercfg_location,
        offer_timeout,
    ):
        self.default_volumes = default_volumes
        self.dockercfg_location = dockercfg_location
        self.offer_timeout = offer_timeout

    def connect(self):
        self.runner = self.get_runner(self.mesos_address, self.queue)
        self.handle_next_event()

    def handle_next_event(self, deferred_result=None):
        """Arm a one-shot read on the event queue.

        The deferred processes the event, then re-arms this handler; errors
        are logged (logError) and the chain is re-armed afterwards as well.
        A no-op when a read is already outstanding.

        :param deferred_result: ignored; present so this method can be used
            directly as a Twisted callback/errback.
        """
        if self.deferred and not self.deferred.called:
            log.warning(
                'Already have handlers waiting for next event in queue, '
                'not adding more'
            )
            return
        self.deferred = self.queue.get()
        # NOTE(review): each line below re-reads self.deferred. If the queue
        # is non-empty the deferred may fire synchronously, and a callback
        # (e.g. stop()) can replace self.deferred mid-chain — keep this
        # statement order as-is; confirm before refactoring.
        self.deferred.addCallback(self._process_event)
        self.deferred.addCallback(self.handle_next_event)
        self.deferred.addErrback(logError)
        self.deferred.addErrback(self.handle_next_event)

    def _check_connection(self):
        if self.runner.stopping:
            # Last framework was terminated for some reason, re-connect.
            log.info('Last framework stopped, re-connecting')
            self.connect()
        elif self.deferred.called:
            # Just in case callbacks are missing, re-add.
            self.handle_next_event()

    def submit(self, task):
        if not task:
            return

        if not self.enabled:
            task.log.info('Task failed to start, Mesos is disabled.')
            task.exited(1)
            return
        self._check_connection()

        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        self.runner.run(task.get_config())
        log.info(
            'Submitting task {} to {}'.format(
                mesos_task_id,
                self.mesos_address,
            ),
        )
        task.report_resources()

    def recover(self, task):
        if not task:
            return

        if not self.enabled:
            task.log.info('Could not recover task, Mesos is disabled.')
            task.exited(None)
            return
        self._check_connection()

        mesos_task_id = task.get_mesos_id()
        self.tasks[mesos_task_id] = task
        task.log.info('Reconciling state for this task from Mesos')
        task.started()
        self.runner.reconcile(task.get_config())
        task.report_resources()

    def create_task(
        self,
        action_run_id,
        command,
        cpus,
        mem,
        disk,
        constraints,
        docker_image,
        docker_parameters,
        env,
        extra_volumes,
        serializer,
        task_id=None,
    ):
        if not self.runner:
            return None

        uris = [self.dockercfg_location] if self.dockercfg_location else []
        volumes = combine_volumes(self.default_volumes, extra_volumes)
        task_kwargs = {
            'name': action_run_id,
            'cmd': command,
            'cpus': cpus,
            'mem': mem,
            'disk': disk,
            'constraints': constraints,
            'image': docker_image,
            'docker_parameters': docker_parameters,
            'environment': env,
            'volumes': volumes,
            'uris': uris,
            'offer_timeout': self.offer_timeout,
        }
        task_config = self.runner.TASK_CONFIG_INTERFACE(**task_kwargs)

        if task_id is not None:
            try:
                task_config = task_config.set_task_id(task_id)
            except ValueError:
                log.error(f'Invalid {task_id} for {action_run_id}')
                return

        return MesosTask(action_run_id, task_config, serializer)

    def get_runner(self, mesos_address, queue):
        """Build a Subscription runner for a new framework, or reuse the
        currently-running one.

        Returns None when Mesos is disabled.
        """
        if not self.enabled:
            log.info('Mesos is disabled, not creating a framework.')
            return None
        if self.runner and not self.runner.stopping:
            log.info('Already have a running framework, not creating one.')
            return self.runner

        framework_name = 'tron-{}'.format(socket.gethostname())
        mesos_executor = self.processor.executor_from_config(
            provider='mesos_task',
            provider_config={
                'secret': self.secret,
                'principal': self.principal,
                'mesos_address': get_mesos_leader(
                    mesos_address, self.mesos_master_port,
                ),
                'role': self.mesos_role,
                'framework_name': framework_name,
                'framework_id': self.framework_id,
                # Keep the framework id on disconnect so we can fail over.
                'failover': True,
            },
        )

        def log_output(task_id, message, stream):
            # Route task stdout/stderr into per-task, per-stream loggers.
            name = '{}.{}.{}'.format(TASK_OUTPUT_LOGGER, task_id, stream)
            logging.getLogger(name).info(message)

        logging_executor = self.processor.executor_from_config(
            provider='logging',
            provider_config={
                'downstream_executor': mesos_executor,
                'handler': log_output,
                'format_string': '{line}',
            },
        )
        return Subscription(logging_executor, queue)

    def _process_event(self, event):
        if event.kind == 'control':
            message = getattr(event, 'message', None)
            if message == 'stop':
                # Framework has been removed, stop it.
                log.warning('Framework has been stopped: {}'.format(event.raw))
                self.stop()
                MesosClusterRepository.remove(self.mesos_address)
            elif message == 'unknown':
                log.warning(
                    'Unknown error from Mesos master: {}'.format(event.raw)
                )
            elif message == 'registered':
                framework_id = event.raw['framework_id']['value']
                MesosClusterRepository.save(self.mesos_address, framework_id)
            else:
                log.warning('Unknown type of control event: {}'.format(event))

        elif event.kind == 'task':
            if not hasattr(event, 'task_id'):
                log.warning('Task event missing task_id: {}'.format(event))
                return
            if event.task_id not in self.tasks:
                log.warning(
                    'Received event for unknown task {}: {}'.format(
                        event.task_id,
                        event,
                    ),
                )
                return
            task = self.tasks[event.task_id]
            task.handle_event(event)
            if task.is_done:
                del self.tasks[event.task_id]
        else:
            log.warning('Unknown type of event: {}'.format(event))

    def stop(self, fail_tasks=False):
        """Shut down the framework runner and reset event plumbing.

        :param fail_tasks: when True, every tracked task is marked as
            exited with an unknown status (None) and dropped.
        """
        self.framework_id = None
        active = self.runner
        if active:
            active.stop()

        # Drop any armed queue read and start over with an empty queue.
        pending, self.deferred = self.deferred, None
        if pending:
            pending.cancel()
        self.queue = PyDeferredQueue()

        if fail_tasks:
            for task_id, task in list(self.tasks.items()):
                task.exited(None)
                del self.tasks[task_id]

    def kill(self, task_id):
        return self.runner.kill(task_id)