Example #1
def test__stop_start_partial_tests_with_multiple_ranges(test_job):
    test_job.create()
    test_job.wait_for_state(goal_state='RUNNING')

    range1 = task_pb2.InstanceRange(to=1)
    setattr(range1, 'from', 0)
    range2 = task_pb2.InstanceRange(to=2)
    setattr(range2, 'from', 1)

    def wait_for_instance_to_stop():
        return (test_job.get_task(0).state_str == 'KILLED'
                and test_job.get_task(1).state_str == 'KILLED')

    test_job.stop(ranges=[range1, range2])
    test_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return (test_job.get_task(0).state_str == 'RUNNING'
                and test_job.get_task(1).state_str == 'RUNNING')

    test_job.start(ranges=[range1, range2])
    test_job.wait_for_condition(wait_for_instance_to_run)

    test_job.stop()
    test_job.wait_for_state(goal_state='KILLED')
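
The setattr calls above exist because 'from' is a Python keyword, so it cannot be written as a literal keyword argument to the generated InstanceRange constructor. A minimal helper sketch under that assumption, reusing the task_pb2 module these examples already import (the helper name is illustrative, not from the source):

def make_instance_range(start, stop):
    # Build an InstanceRange covering instances [start, stop).
    # 'from' is a Python keyword, so it cannot appear as a literal
    # keyword argument; unpacking a dict avoids the SyntaxError.
    return task_pb2.InstanceRange(**{"from": start, "to": stop})

# Equivalent to range1 and range2 above:
range1 = make_instance_range(0, 1)
range2 = make_instance_range(1, 2)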
Example #2
def test__stop_start_partial_tests_with_multiple_ranges(long_running_job):
    long_running_job.create()
    long_running_job.wait_for_state(goal_state="RUNNING")

    range1 = task_pb2.InstanceRange(to=1)
    setattr(range1, "from", 0)
    range2 = task_pb2.InstanceRange(to=2)
    setattr(range2, "from", 1)

    def wait_for_instance_to_stop():
        return (long_running_job.get_task(0).state_str == "KILLED"
                and long_running_job.get_task(1).state_str == "KILLED")

    long_running_job.stop(ranges=[range1, range2])
    long_running_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return (long_running_job.get_task(0).state_str == "RUNNING"
                and long_running_job.get_task(1).state_str == "RUNNING")

    long_running_job.start(ranges=[range1, range2])
    long_running_job.wait_for_condition(wait_for_instance_to_run)

    long_running_job.stop()
    long_running_job.wait_for_state(goal_state="KILLED")
Example #3
    def stop_task(self, job_id, instance_id):
        """
        :param job_id: id of the job
        :param instance_id: instance id of the task to stop

        :type job_id: str
        :type instance_id: int

        :rtype: task.StopResponse
        """
        rng = task.InstanceRange(to=instance_id + 1)
        setattr(rng, "from", instance_id)
        request = task.StopRequest(
            jobId=peloton.JobID(value=job_id), ranges=[rng]
        )
        try:
            print_okblue("Stopping task %d of Job %s" % (instance_id, job_id))
            resp = self.client.task_svc.Stop(
                request,
                metadata=self.client.jobmgr_metadata,
                timeout=default_timeout,
            )
            return resp
        except Exception as e:
            print_fail("Exception calling Stop Tasks: %s" % str(e))
            raise
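
A hypothetical invocation sketch for stop_task; the client object and job id below are illustrative only, not from the source:

# 'cli' is assumed to be an instance of the class defining stop_task above.
resp = cli.stop_task(job_id="91b1b8e5-2ba8-11e7-bc3f-0242ac11000d",
                     instance_id=2)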
Example #4
def test__stop_start_tasks_when_mesos_master_down_and_jobmgr_restarts(
        test_job, mesos_master, jobmgr):
    test_job.create()
    test_job.wait_for_state(goal_state='RUNNING')

    range = task_pb2.InstanceRange(to=1)
    setattr(range, 'from', 0)

    def wait_for_instance_to_stop():
        return test_job.get_task(0).state_str == 'KILLED'

    mesos_master.stop()
    test_job.stop(ranges=[range])
    jobmgr.restart()
    mesos_master.start()
    test_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return test_job.get_task(0).state_str == 'RUNNING'

    mesos_master.stop()
    test_job.start(ranges=[range])
    jobmgr.restart()
    mesos_master.start()
    test_job.wait_for_condition(wait_for_instance_to_run)

    mesos_master.stop()
    test_job.stop()
    jobmgr.restart()
    mesos_master.start()
    test_job.wait_for_state(goal_state='KILLED')
Example #5
def test__update_reduce_instances_stopped_tasks(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    old_pod_infos = stateless_job.query_pods()
    assert len(old_pod_infos) == 3
    # first increase instances
    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="SUCCEEDED")
    new_pod_infos = stateless_job.query_pods()
    assert len(new_pod_infos) == 5
    # now stop last 2 tasks
    ranges = task_pb2.InstanceRange(to=5)
    setattr(ranges, "from", 3)
    stateless_job.stop(ranges=[ranges])
    # now reduce instance count
    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="SUCCEEDED")
    new_pod_infos = stateless_job.query_pods()
    assert len(new_pod_infos) == 3
Example #6
def test__stop_start_tasks_when_mesos_master_down_and_jobmgr_restarts(
        long_running_job, mesos_master, jobmgr):
    long_running_job.create()
    long_running_job.wait_for_state(goal_state="RUNNING")

    range = task_pb2.InstanceRange(to=1)
    setattr(range, "from", 0)

    def wait_for_instance_to_stop():
        return long_running_job.get_task(0).state_str == "KILLED"

    mesos_master.stop()
    long_running_job.stop(ranges=[range])
    jobmgr.restart()
    mesos_master.start()
    long_running_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return long_running_job.get_task(0).state_str == "RUNNING"

    mesos_master.stop()
    long_running_job.start(ranges=[range])
    jobmgr.restart()
    mesos_master.start()
    long_running_job.wait_for_condition(wait_for_instance_to_run)

    mesos_master.stop()
    long_running_job.stop()
    jobmgr.restart()
    mesos_master.start()
    long_running_job.wait_for_terminated()
Example #7
def test__stop_start_tasks_when_mesos_master_down_kills_tasks_when_started(
        test_job, mesos_master):
    test_job.create()
    test_job.wait_for_state(goal_state="RUNNING")

    range = task_pb2.InstanceRange(to=1)
    setattr(range, "from", 0)

    def wait_for_instance_to_stop():
        return test_job.get_task(0).state_str == "KILLED"

    mesos_master.stop()
    test_job.stop(ranges=[range])
    mesos_master.start()
    test_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return test_job.get_task(0).state_str == "RUNNING"

    mesos_master.stop()
    test_job.start(ranges=[range])
    mesos_master.start()
    test_job.wait_for_condition(wait_for_instance_to_run)

    mesos_master.stop()
    test_job.stop()
    mesos_master.start()
    test_job.wait_for_terminated()
Example #8
    def test__stop_start_tasks_when_mesos_master_down_and_jobmgr_restarts_stateless_job(
            self, failure_tester):
        """
        Start and stop some tasks in a stateless job while mesos master is not running
        and job manager restarts, verify that those tasks start and stop as expected
        """
        # Step 1: Start a stateless job.
        stateless_job = failure_tester.stateless_job()
        stateless_job.create()
        stateless_job.wait_for_state(goal_state="RUNNING")

        range = task_pb2.InstanceRange(to=1)
        setattr(range, "from", 0)

        # Step 2: Stop a subset of job instances when mesos master is down.
        assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
        stateless_job.stop(ranges=[range])

        # Step 3: Restart job manager.
        leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
        assert leader
        assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader)
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client

        # Step 4: Start mesos master and wait for the instances to be stopped.
        assert 0 != failure_tester.fw.start(failure_tester.mesos_master)

        def wait_for_instance_to_stop():
            return stateless_job.get_task(0).state_str == "KILLED"

        stateless_job.wait_for_condition(wait_for_instance_to_stop)

        # Step 5: Start the same subset of instances when mesos master is down.
        assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
        stateless_job.start(ranges=[range])

        # Step 6: Restart job manager.
        leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
        assert leader
        assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader)
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client

        # Step 7: Start mesos master and wait for the instances to transition to RUNNING.
        assert 0 != failure_tester.fw.start(failure_tester.mesos_master)

        def wait_for_instance_to_run():
            return stateless_job.get_task(0).state_str == "RUNNING"

        stateless_job.wait_for_condition(wait_for_instance_to_run)
Example #9
    def test__stop_start_tasks_when_mesos_master_down_and_jobmgr_restarts_batch_job(
            self, failure_tester):
        """
        Start and stop some tasks in a job while mesos master is not running
        and job manager restarts, verify that those tasks start and stop as expected
        """
        # Step 1: start the job
        long_running_job = failure_tester.job(job_file="long_running_job.yaml")
        long_running_job.create()

        range = task_pb2.InstanceRange(to=1)
        setattr(range, "from", 0)

        # Step 2: stop some tasks in the job while mesos master is not running
        assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
        long_running_job.stop(ranges=[range])

        leader1 = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
        assert leader1
        assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader1)
        failure_tester.reset_client()
        long_running_job.client = failure_tester.client

        assert 0 != failure_tester.fw.start(failure_tester.mesos_master)

        def wait_for_instance_to_stop():
            return long_running_job.get_task(0).state_str == "KILLED"

        long_running_job.wait_for_condition(wait_for_instance_to_stop)

        # Step 3: start the same tasks that were stopped while mesos master is not running
        assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
        long_running_job.start(ranges=[range])

        leader2 = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
        assert leader2
        assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader2)
        failure_tester.reset_client()
        long_running_job.client = failure_tester.client

        assert 0 != failure_tester.fw.start(failure_tester.mesos_master)

        def wait_for_instance_to_run():
            return long_running_job.get_task(0).state_str == "RUNNING"

        long_running_job.wait_for_condition(wait_for_instance_to_run)
Example #10
def test__stop_start_tasks_when_mesos_master_down_and_jobmgr_restarts(
        stateless_job, mesos_master, jobmgr):
    """
    1. Create stateless job.
    2. Wait for job state RUNNING.
    3. Stop a subset of job instances when mesos master is down.
    4. Restart job manager.
    5. Start mesos master and wait for the instances to be stopped.
    6. Start the same subset of instances when mesos master is down.
    7. Restart job manager.
    8. Start mesos master and wait for the instances to transition to RUNNING.
    9. Stop the job when mesos master is down.
    10. Restart job manager.
    11. Start mesos master and wait for the job to terminate.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    range = task_pb2.InstanceRange(to=1)
    setattr(range, "from", 0)

    def wait_for_instance_to_stop():
        return stateless_job.get_task(0).state_str == "KILLED"

    mesos_master.stop()
    stateless_job.stop(ranges=[range])
    jobmgr.restart()
    mesos_master.start()
    stateless_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return stateless_job.get_task(0).state_str == "RUNNING"

    mesos_master.stop()
    stateless_job.start(ranges=[range])
    jobmgr.restart()
    mesos_master.start()
    stateless_job.wait_for_condition(wait_for_instance_to_run)

    mesos_master.stop()
    stateless_job.stop()
    jobmgr.restart()
    mesos_master.start()
    stateless_job.wait_for_terminated()
Example #11
def test__stop_start_pod_on_sla_violated_job(stateless_job):
    """
    1. Create a stateless job (instance_count=5) with a host-limit-1 constraint
       and MaximumUnavailableInstances=1. Since there are only 3 UP hosts, 2 of
       the instances will not get placed (hence unavailable).
    2. Kill one of the running instances (say i). The instance should get killed.
    3. Start instance i. Instance i should transition to PENDING (due to the
       host-limit-1 constraint, the instance won't get placed).
    """
    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
    stateless_job.job_spec.instance_count = 5
    stateless_job.create()
    stateless_job.wait_for_all_pods_running(num_pods=3)

    test_instance = None
    for i in range(0, stateless_job.job_spec.instance_count):
        if stateless_job.get_pod_status(i).state == pod_pb2.POD_STATE_RUNNING:
            test_instance = i
            break

    print(test_instance)
    assert test_instance is not None

    ranges = task_pb2.InstanceRange(to=test_instance + 1)
    setattr(ranges, "from", test_instance)
    stateless_job.stop(ranges=[ranges])

    def instance_killed():
        return stateless_job.get_pod_status(
            test_instance).state == pod_pb2.POD_STATE_KILLED

    stateless_job.wait_for_condition(instance_killed)

    stateless_job.start(ranges=[ranges])

    def instance_pending():
        return stateless_job.get_pod_status(
            test_instance).state == pod_pb2.POD_STATE_PENDING

    stateless_job.wait_for_condition(instance_pending)
Example #12
def test__stop_start_partial_tests_with_single_range(stateless_job):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    range = task_pb2.InstanceRange(to=1)
    setattr(range, "from", 0)

    def wait_for_instance_to_stop():
        return stateless_job.get_task(0).state_str == "KILLED"

    stateless_job.stop(ranges=[range])
    stateless_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return stateless_job.get_task(0).state_str == "RUNNING"

    stateless_job.start(ranges=[range])
    stateless_job.wait_for_condition(wait_for_instance_to_run)

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")
Example #13
def test__stop_start_partial_tests_with_single_range(test_job):
    test_job.create()
    test_job.wait_for_state(goal_state='RUNNING')

    range = task_pb2.InstanceRange(to=1)
    setattr(range, 'from', 0)

    def wait_for_instance_to_stop():
        return test_job.get_task(0).state_str == 'KILLED'

    test_job.stop(ranges=[range])
    test_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return test_job.get_task(0).state_str == 'RUNNING'

    test_job.start(ranges=[range])
    test_job.wait_for_condition(wait_for_instance_to_run)

    test_job.stop()
    test_job.wait_for_state(goal_state='KILLED')
Example #14
    def test__stop_start_tasks_when_mesos_master_down_kills_tasks_when_started(self, failure_tester):
        """
        Stop and start tasks while the mesos master is down, then bring the
        mesos master back up and verify the expected state transitions:
        1. Create a long-running job.
        2. Wait for job state RUNNING.
        3. Stop a subset of job instances when mesos master is down.
        4. Start mesos master and wait for the instances to be stopped.
        5. Start the same subset of instances when mesos master is down.
        6. Start mesos master and wait for the instances to transition to RUNNING.
        7. Stop the job when mesos master is down.
        8. Start mesos master and wait for the job to terminate.
        """
        long_running_job = failure_tester.job(job_file="long_running_job.yaml")
        long_running_job.create()
        long_running_job.wait_for_state(goal_state="RUNNING")

        range = task_pb2.InstanceRange(to=1)
        setattr(range, "from", 0)

        def wait_for_instance_to_stop():
            return long_running_job.get_task(0).state_str == "KILLED"

        assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
        long_running_job.stop(ranges=[range])
        assert 0 != failure_tester.fw.start(failure_tester.mesos_master)
        long_running_job.wait_for_condition(wait_for_instance_to_stop)

        def wait_for_instance_to_run():
            return long_running_job.get_task(0).state_str == "RUNNING"

        assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
        long_running_job.start(ranges=[range])
        assert 0 != failure_tester.fw.start(failure_tester.mesos_master)
        long_running_job.wait_for_condition(wait_for_instance_to_run)

        assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
        long_running_job.stop()
        assert 0 != failure_tester.fw.start(failure_tester.mesos_master)
        long_running_job.wait_for_terminated()
Example #15
    def _get_range(self):
        _range = task.InstanceRange(to=self.instance_id + 1)
        # 'from' is a reserved keyword, so we have to set it via setattr
        setattr(_range, 'from', self.instance_id)
        return [_range]
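
Reading the field back hits the same keyword restriction: _range.from is a SyntaxError, so getattr is the read-side counterpart of setattr. A quick sketch, assuming the same task module used above:

rng = task.InstanceRange(to=4)
setattr(rng, 'from', 3)           # rng.from = 3 would be a SyntaxError
assert getattr(rng, 'from') == 3  # likewise for reading the field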