def get_task_status(client, job_key, instances=None):
    """Query the current task status for a job.

    Args:
        client: aurora client object
        job_key: aurora JobKey struct specifying the job to query for
        instances: a list of instance ids to report on; None means all
            instances

    Returns:
        a list of ScheduleStatus enum values, one per selected instance,
        taken from that instance's latest pod run
    """
    res = client.get_tasks_without_configs(api.TaskQuery(jobKeys=[job_key]))

    assert res.tasks is not None

    # Group (run_id, status) pairs by instance id.
    tasks_per_instance = {}
    for t in res.tasks:
        instance_id = t.assignedTask.instanceId
        # taskId ends in "...-<instance>-<run>"; the trailing component is
        # a monotonically increasing integer run ordinal — TODO confirm
        # the run id is always numeric for this deployment.
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        # Compare run ids numerically: the original string comparison
        # ranked run "9" above run "10", picking a stale status once an
        # instance accumulated ten or more runs.
        tasks_per_instance.setdefault(instance_id, []).append(
            (int(run_id), t.status))

    # grab task status from latest pod run
    return [
        max(statuses)[1] for iid, statuses in tasks_per_instance.iteritems()
        if instances is None or iid in instances
    ]
def verify_task_config(client, job_key, metadata_dict):
    """Assert that metadata on all RUNNING tasks matches ``metadata_dict``.

    Every metadata entry on every running task must have a key present in
    ``metadata_dict`` and carry the expected value; any other key fails
    the assertion.

    Args:
        client: aurora client object
        job_key: aurora job key to query
        metadata_dict: mapping of expected metadata key -> value
    """
    query = api.TaskQuery(jobKeys={job_key},
                          statuses={api.ScheduleStatus.RUNNING})
    res = client.get_tasks_without_configs(query)

    for task in res.tasks:
        for meta in task.assignedTask.task.metadata:
            assert meta.key in metadata_dict, \
                "unexpected metadata {}".format(meta)
            assert meta.value == metadata_dict[meta.key]
def get_running_tasks(client, job_key):
    """Fetch the tasks of a job that are currently in RUNNING state.

    Calls the getTasksWithoutConfigs endpoint filtered by job key and
    RUNNING status.

    Args:
        client: aurora client object
        job_key: aurora job key

    Returns:
        the ``tasks`` field of the endpoint response
    """
    query = api.TaskQuery(
        jobKeys={job_key},
        statuses={api.ScheduleStatus.RUNNING},
    )
    return client.get_tasks_without_configs(query).tasks
# NOTE(review): the lines "Esempio n. 4" / "0" here were extraction
# artifacts (an example-listing separator from the original source page),
# not Python code; replaced with this comment to keep the file parseable.
    def perf_aurora_bridge_test_write_path(self):
        """Measure end-to-end AuroraBridge write-path latency.

        - Create a stateless job via AuroraBridge, #tasks = 600 and batch_size = 50
        - Create a healthy update for the stateless job (same size/batch).
        - Create an unhealthy update with auto-rollback at 200 failed instances.
        - Roll out pinned-instance configs: C1 for instances 0-99 and
          C3 for instances 301-599 (instances 100-300 keep the healthy
          update's config).
        - Deploy a new config for all tasks, then trigger a manual rollback.
        - Once the manual rollback completes, validate per-range metadata
          correctness and report the time taken for each step.
        """
        try:
            start_time = time.time()

            print('Create Stateless Job of 600 instances and batch size 50')
            step_start_time = time.time()
            start_job_update(
                self.client.aurora_bridge_client,
                get_job_update_request("test_dc_labrat.yaml"),
                "create job",
            )
            print('Time Taken:: %f' % (time.time() - step_start_time))

            print('\nUpdate Stateless Job of 600 instances with healthy config and batch size 50')
            step_start_time = time.time()
            start_job_update(
                self.client.aurora_bridge_client,
                get_job_update_request("test_dc_labrat_update.yaml"),
                "start job update",
            )
            print('Time Taken:: %f' % (time.time() - step_start_time))

            print('\nRollout bad config which will trigger auto-rollback after 200 failed instances')
            step_start_time = time.time()
            resp = self.client.aurora_bridge_client.start_job_update(
                get_job_update_request("test_dc_labrat_bad_config.yaml"),
                "start job update bad config",
            )
            job_update_key = resp.key
            wait_for_rolled_back(self.client.aurora_bridge_client,
                                 job_update_key)
            print('Time Taken:: %f' % (time.time() - step_start_time))

            print('\nRollout Pinned Instance Config (C1) for Instance: 0-99')
            step_start_time = time.time()
            pinned_req = get_job_update_request("test_dc_labrat.yaml")
            # One single-instance Range per pinned instance.
            pinned_req.settings.updateOnlyTheseInstances = set(
                api.Range(first=i, last=i) for i in range(100))
            job_key = start_job_update(
                self.client.aurora_bridge_client,
                pinned_req,
                "update pinned instance req",
            )
            print('Time Taken:: %f' % (time.time() - step_start_time))

            print('\nRollout Pinned Instance Config (C3) for Instance: 301-599')
            step_start_time = time.time()
            pinned_req = get_job_update_request("test_dc_labrat_update2.yaml")
            pinned_req.settings.updateOnlyTheseInstances = set(
                api.Range(first=i, last=i) for i in range(301, 600))
            start_job_update(
                self.client.aurora_bridge_client,
                pinned_req,
                "update pinned instance req",
            )
            print('Time Taken:: %f' % (time.time() - step_start_time))

            print('\nRollout update and trigger manual rollback')
            step_start_time = time.time()
            resp = self.client.aurora_bridge_client.start_job_update(
                get_job_update_request("test_dc_labrat.yaml"),
                "start job update with good config",
            )
            job_update_key = resp.key
            # Let the update make partial progress before rolling back.
            time.sleep(100)
            self.client.aurora_bridge_client.rollback_job_update(
                job_update_key)
            wait_for_rolled_back(self.client.aurora_bridge_client,
                                 job_update_key)
            print('Time Taken:: %f' % (time.time() - step_start_time))

            print('Validate pinned instance configs are set correctly')
            step_start_time = time.time()
            resp = self.client.aurora_bridge_client.get_tasks_without_configs(
                api.TaskQuery(jobKeys={job_key},
                              statuses={api.ScheduleStatus.RUNNING}))
            assert len(resp.tasks) == 600

            print('Time Taken:: %f' % (time.time() - step_start_time))

            elapsed_time = time.time() - start_time
            print('\n\nTotal Time Taken:: %f' % (elapsed_time))

            # Per-range expected metadata. BUG FIX: the original code only
            # iterated task metadata in the first (iid < 100) branch; the
            # other branches inspected the stale `m` left over from a
            # previous loop, so instances >= 100 were never actually
            # validated.
            for t in resp.tasks:
                _, instance_id, _ = t.assignedTask.taskId.rsplit("-", 2)
                iid = int(instance_id)
                if iid < 100:
                    expected = {"test_key_1": "test_value_1",
                                "test_key_2": "test_value_2"}
                elif iid < 300:
                    expected = {"test_key_11": "test_value_11",
                                "test_key_22": "test_value_22"}
                else:
                    expected = {"test_key_111": "test_value_111",
                                "test_key_222": "test_value_222"}
                for m in t.assignedTask.task.metadata:
                    if m.key in expected:
                        assert m.value == expected[m.key]

        except Exception as e:
            # BUG FIX: the original handler only printed the exception,
            # so assertion failures were swallowed and the test always
            # "passed". Print for the perf log, then re-raise.
            print(e)
            raise