def test__acquire_release_host_offers():
    """Acquire at most two host offers, then release them to hostmgr.

    The filter asks for hosts with at least 3.0 CPUs and caps the result at
    two hosts. Every returned offer must name a host in ``MESOS_AGENTS`` and
    the release response must not carry an ``error`` field.
    """
    acquire_request = v0hostmgr.AcquireHostOffersRequest(
        filter=v0hostmgr.HostFilter(
            resourceConstraint=v0hostmgr.ResourceConstraint(
                minimum=task.ResourceConfig(cpuLimit=3.0)),
            quantity=v0hostmgr.QuantityControl(maxHosts=2),
        ))

    client = with_private_stubs(Client())
    acquired = client.hostmgr_svc.AcquireHostOffers(
        acquire_request,
        metadata=client.hostmgr_metadata,
        timeout=20)

    # max hosts is 2, we should expect 2 host offers
    assert len(acquired.hostOffers) == 2
    for host_offer in acquired.hostOffers:
        assert host_offer.hostname in MESOS_AGENTS

    # release offers to hostmgr
    release_request = v0hostmgr.ReleaseHostOffersRequest(
        hostOffers=acquired.hostOffers)
    released = client.hostmgr_svc.ReleaseHostOffers(
        request=release_request,
        metadata=client.hostmgr_metadata,
        timeout=20)
    assert released.HasField("error") is False
def test__cluster_capacity():
    """Check the physical cluster capacity reported by the host manager.

    The response must not carry an ``error`` field, every physical resource
    must be of a known kind, and the cpu/memory totals must match the
    expected values for the test cluster.
    """
    # get cluster capacity
    client = with_private_stubs(Client())
    capacity_request = v0hostmgr.ClusterCapacityRequest()
    resp = client.hostmgr_svc.ClusterCapacity(
        request=capacity_request,
        metadata=client.hostmgr_metadata,
        timeout=20)
    assert resp.HasField("error") is False

    known_kinds = ['cpu', 'gpu', 'memory', 'disk', 'fd']
    expected_capacity = {
        'cpu': 12.0,        # 4cpu * 3 agents
        'memory': 6144.0,   # 2048Mb * 3 agents
    }

    # check capacity
    for resource in resp.physicalResources:
        assert resource.kind in known_kinds
        if resource.kind in expected_capacity:
            assert resource.capacity == expected_capacity[resource.kind]
def test_task_killed_in_ready_succeeds_when_re_enqueued(placement_engines):
    """Check that a task killed while READY runs once it is re-enqueued.

    The placement engines are stopped so the task stays in the READY state
    in the resource manager; the task is then stopped and started again,
    the engines are restarted, and instance 0 is expected to reach RUNNING.
    """
    # stop the placement engines to keep the tasks in READY state
    placement_engines.stop()

    # decorate the client to add peloton private API stubs
    stubbed_client = with_private_stubs(Client())

    # create long running job with 2 instances
    job_options = [with_instance_count(2)]
    long_running_job = Job(
        job_file='long_running_job.yaml',
        options=job_options,
        client=stubbed_client,
    )
    long_running_job.create()
    long_running_job.wait_for_state(goal_state='PENDING')

    first_task = long_running_job.get_task(0)
    # wait for task to reach READY
    first_task.wait_for_pending_state(goal_state='READY')

    # kill the task, then re-enqueue it
    first_task.stop()
    first_task.start()

    # restart the placement engines so the task can be placed
    placement_engines.start()

    def wait_for_instance_to_run():
        current_state = long_running_job.get_task(0).state_str
        return current_state == 'RUNNING'

    long_running_job.wait_for_condition(wait_for_instance_to_run)
def test__acquire_return_offers_errors():
    """Exercise the acquire/release error paths of the host manager.

    Asking for 14 CPUs is expected to match no host, so no offers come back.
    A release request carrying an invalid offer ID is then sent and the
    response's ``error`` attribute is checked.
    """
    acquire_request = v0hostmgr.AcquireHostOffersRequest(
        filter=v0hostmgr.HostFilter(
            resourceConstraint=v0hostmgr.ResourceConstraint(
                minimum=task.ResourceConfig(cpuLimit=14.0))))

    # decorate the client to add peloton private API stubs
    client = with_private_stubs(Client())

    # ask is 14 cpus, so no hosts should match this
    acquired = client.hostmgr_svc.AcquireHostOffers(
        acquire_request,
        metadata=client.hostmgr_metadata,
        timeout=20)
    assert len(acquired.hostOffers) == 0

    # release offers to hostmgr with an invalid offer ID
    bogus_offer = v0hostmgr.HostOffer(
        id=peloton.HostOfferID(value="invalid_id"))
    released = client.hostmgr_svc.ReleaseHostOffers(
        request=v0hostmgr.ReleaseHostOffersRequest(hostOffers=[bogus_offer]),
        metadata=client.hostmgr_metadata,
        timeout=20)

    # NOTE(review): if the response is a protobuf message, `released.error`
    # is presumably never None, which would make this check always pass.
    # Confirm whether `released.HasField("error")` is what was intended.
    assert released.error is not None
def test__tasks_reserve_execution(hostreservepool, peloton_client):
    """Check that a large task gets a host reserved and then runs on it.

    A "median" job is started first and all of its tasks must be RUNNING.
    A one-instance "large" job is then created; its task must pass through
    the RESERVED pending state, a host must be reported as 'reserved', and
    the task must end up running on that host. Once the large job finishes,
    no host may remain in the 'reserved' status. Both jobs are killed at
    the end.
    """
    median_config = IntegrationTestConfig(
        max_retry_attempts=100,
        sleep_time_sec=1)
    p_job_median = Job(
        job_file='test_hostreservation_job_median.yaml',
        pool=hostreservepool,
        client=peloton_client,
        config=median_config,
    )
    p_job_median.create()
    p_job_median.wait_for_state(goal_state='RUNNING')

    # we should have all 3 tasks in running state
    def all_running():
        for median_task in p_job_median.get_tasks().values():
            if median_task.state != task.RUNNING:
                return False
        return True

    p_job_median.wait_for_condition(all_running)

    # decorate the client to add peloton private API stubs
    client = with_private_stubs(peloton_client)

    large_config = IntegrationTestConfig(
        sleep_time_sec=1,
        max_retry_attempts=300)
    p_job_large = Job(
        job_file='test_hostreservation_job_large.yaml',
        pool=hostreservepool,
        client=client,
        config=large_config,
        options=[with_instance_count(1)],
    )
    p_job_large.create()
    p_job_large.wait_for_state(goal_state='PENDING')

    request = hostmgr.GetHostsByQueryRequest()

    def query_hosts():
        # One host query round trip, using the large job's client metadata
        # and RPC timeout.
        return client.hostmgr_svc.GetHostsByQuery(
            request,
            metadata=p_job_large.client.hostmgr_metadata,
            timeout=p_job_large.config.rpc_timeout_sec,
        )

    # task should get into reserved state and RUNNING state
    large_task = p_job_large.get_task(0)
    large_task.wait_for_pending_state(goal_state="RESERVED")

    def get_reserved_host():
        # Hostname of the first host reported as reserved, '' if none.
        return next(
            (h.hostname for h in query_hosts().hosts
             if h.status == 'reserved'),
            '')

    def is_reserved():
        return get_reserved_host() != ''

    p_job_large.wait_for_condition(is_reserved)
    reserved_host = get_reserved_host()

    # the task is running on reserved host
    large_task.wait_for_pending_state(goal_state="RUNNING")
    assert reserved_host == large_task.get_info().runtime.host

    # p_job_large should succeed
    p_job_large.wait_for_state()

    # no host is in reserved state
    for host in query_hosts().hosts:
        assert host.status != 'reserved'

    kill_jobs([p_job_median, p_job_large])
def client(self):
    """Return a new ``Client`` passed through ``with_private_stubs``."""
    stubbed_client = with_private_stubs(Client())
    return stubbed_client
def test__launch_kill():
    """Launch a task on an acquired host offer and then kill it.

    Steps:
      1. Launching with a random (invalid) offer ID must raise.
      2. Launching with the acquired offer's ID must succeed.
      3. Killing with an empty task ID list must report an error.
      4. Killing the launched task ID must succeed.
    """
    client = with_private_stubs(Client())

    # acquire 1 host offer
    resource_constraint = v0hostmgr.ResourceConstraint(
        minimum=task.ResourceConfig(cpuLimit=3.0))
    host_filter = v0hostmgr.HostFilter(
        resourceConstraint=resource_constraint,
        quantity=v0hostmgr.QuantityControl(maxHosts=1),
    )
    request = v0hostmgr.AcquireHostOffersRequest(filter=host_filter)

    resp = client.hostmgr_svc.AcquireHostOffers(
        request,
        metadata=client.hostmgr_metadata,
        timeout=20)
    assert len(resp.hostOffers) == 1
    # Keep the offer in its own name so later steps never depend on which
    # response object `resp` currently refers to.
    offer = resp.hostOffers[0]

    # launch a test task using this offer
    cmd = "echo 'succeeded instance task' & sleep 100"
    tc = task.TaskConfig(
        command=mesos.CommandInfo(shell=True, value=cmd),
        name="task_name",
        resource=task.ResourceConfig(cpuLimit=1.0),
    )
    tid = mesos.TaskID(value=str(uuid.uuid4()) + '-1-1')
    t = v0hostmgr.LaunchableTask(
        taskId=tid,
        config=tc,
    )

    # Test 1
    # launch task using invalid offer
    req = v0hostmgr.LaunchTasksRequest(
        hostname=offer.hostname,
        agentId=offer.agentId,
        tasks=[t],
        id=peloton.HostOfferID(value=str(uuid.uuid4())))
    # The failure is recorded in a flag and asserted outside the try block:
    # asserting inside it would be swallowed by the handler, so the check
    # could never fail.
    launch_rejected = False
    try:
        client.hostmgr_svc.LaunchTasks(
            req,
            metadata=client.hostmgr_metadata,
            timeout=20)
    except Exception:
        launch_rejected = True
    assert launch_rejected, 'LaunchTasks should have failed'

    # Test 2
    # launch task using valid offer
    req = v0hostmgr.LaunchTasksRequest(
        hostname=offer.hostname,
        agentId=offer.agentId,
        tasks=[t],
        id=offer.id)
    resp = client.hostmgr_svc.LaunchTasks(
        req,
        metadata=client.hostmgr_metadata,
        timeout=20)
    assert resp.HasField("error") is False

    # Test 3
    # kill with empty TaskIDs list
    resp = client.hostmgr_svc.KillTasks(
        v0hostmgr.KillTasksRequest(taskIds=[]),
        metadata=client.hostmgr_metadata,
        timeout=20)
    assert resp.HasField("error") is True

    # Test 4
    # kill valid TaskID
    resp = client.hostmgr_svc.KillTasks(
        v0hostmgr.KillTasksRequest(taskIds=[tid]),
        metadata=client.hostmgr_metadata,
        timeout=20)
    assert resp.HasField("error") is False
def _assert_resource_capacities(resources, expected_capacity):
    """Check resource kinds and, where given, their exact capacities.

    Every resource must be one of cpu/gpu/memory/disk. For kinds present in
    ``expected_capacity`` (a dict of kind -> capacity) the reported capacity
    must equal the expected value; other kinds are not checked further.
    """
    for resource in resources:
        assert resource.kind in ['cpu', 'gpu', 'memory', 'disk']
        if resource.kind in expected_capacity:
            assert resource.capacity == expected_capacity[resource.kind]


def test__hostpool_capacity(peloton_client):
    """Check per-pool capacity before and after moving a host to a new pool.

    Initially only the "default" pool exists and holds the whole cluster
    capacity. After creating the "capacity-test" pool with one host, the
    capacity must be split between the two pools. Other host pools are
    cleaned up at the end, even when an assertion fails.
    """
    client = with_private_stubs(peloton_client)

    # Check capacity of default pool.
    resp = client.hostmgr_svc.GetHostPoolCapacity(
        request=v0hostmgr.GetHostPoolCapacityRequest(),
        metadata=client.hostmgr_metadata,
        timeout=20)
    assert len(resp.pools) == 1
    default_pool = resp.pools[0]
    assert default_pool.poolName == "default"
    assert len(default_pool.physicalCapacity) == 4
    assert len(default_pool.slackCapacity) == 4
    # 4cpu * 3 agents, 2048Mb * 3 agents
    _assert_resource_capacities(
        default_pool.physicalCapacity, {'cpu': 12.0, 'memory': 6144.0})
    _assert_resource_capacities(
        default_pool.slackCapacity, {'cpu': 12.0})

    # Expected (physical, slack) capacities per pool once one host has been
    # moved out of "default" into "capacity-test".
    expected_by_pool = {
        # 4cpu * 2 agents, 2048Mb * 2 agents
        "default": ({'cpu': 8.0, 'memory': 4096.0}, {'cpu': 8.0}),
        # 4cpu * 1 agent, 2048Mb * 1 agent
        "capacity-test": ({'cpu': 4.0, 'memory': 2048.0}, {'cpu': 4.0}),
    }

    try:
        # Create a host-pool and move 1 host to it.
        ensure_host_pool("capacity-test", 1, client=peloton_client)

        resp = client.hostmgr_svc.GetHostPoolCapacity(
            request=v0hostmgr.GetHostPoolCapacityRequest(),
            metadata=client.hostmgr_metadata,
            timeout=20)
        assert len(resp.pools) == 2
        for pool in resp.pools:
            assert len(pool.physicalCapacity) == 4
            assert len(pool.slackCapacity) == 4
            assert pool.poolName in expected_by_pool, \
                "Unexpected pool %s" % pool.poolName
            expected_physical, expected_slack = expected_by_pool[pool.poolName]
            _assert_resource_capacities(
                pool.physicalCapacity, expected_physical)
            _assert_resource_capacities(
                pool.slackCapacity, expected_slack)
    finally:
        # Run the cleanup even when an assertion above fails, so the extra
        # pool is not left behind.
        cleanup_other_host_pools([], client=peloton_client)