def create_pod_config(self, sleep_time, dynamic_factor, host_limit_1=False):
    """Build a PodSpec that echoes *dynamic_factor* and sleeps *sleep_time*.

    Args:
        sleep_time: seconds the container sleeps after echoing.
        dynamic_factor: value echoed first, useful to make each config unique.
        host_limit_1: when True, attach a label constraint so at most one
            instance of the pod lands on any single host.

    Returns:
        A pod.PodSpec with one container and an instance label; the
        host-limit constraint is attached only when requested.
    """
    container = pod.ContainerSpec(
        resource=pod.ResourceSpec(
            cpu_limit=0.1,
            mem_limit_mb=32,
            disk_limit_mb=32,
        ),
        command=mesos.CommandInfo(
            shell=True,
            value="echo %s && sleep %s" % (str(dynamic_factor), str(sleep_time)),
        ),
    )

    label = v1alpha_peloton.Label(
        key="peloton/instance", value="instance-label"
    )

    constraint = None
    if host_limit_1:
        # Numeric enum values mirror the proto definitions:
        # type=1 -> label constraint; kind=1 -> Label; condition=2 -> Equal.
        constraint = pod.Constraint(
            type=1,
            label_constraint=pod.LabelConstraint(
                kind=1,
                condition=2,
                requirement=0,
                label=label,
            ),
        )

    return pod.PodSpec(
        containers=[container],
        labels=[label],
        constraint=constraint,
    )
def test__update_with_host_maintenance_and_agent_down(stateless_job, maintenance):
    """
    1. Create a large stateless job (that takes up more than two-thirds of
       the cluster resources) with MaximumUnavailableInstances=2.
    2. Start host maintenance on one of the hosts (say A) having pods of the
       job. MaximumUnavailableInstances=2 ensures that not more than 2 pods
       are unavailable due to host maintenance at a time.
    3. Take down another host which has pods running on it. This will cause
       TASK_LOST to be sent for all pods on the host after 75 seconds.
    4. Start an update to modify the instance spec of one of the pods.
    5. Since TASK_LOST would cause the job SLA to be violated, instances on
       the host A should not be killed once LOST event is received. Verify
       that host A does not transition to DOWN.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.job_spec.default_spec.containers[0].resource.cpu_limit = 0.3
    stateless_job.job_spec.sla.maximum_unavailable_instances = 2
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    # Rank hosts by how many pods of this job they run, busiest first.
    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0]
        for t in sorted(
            host_to_task_count.items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
    ]

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    # Pick another host which has pods of the job to take down.
    host_container = get_container([sorted_hosts[1]])

    try:
        host_container.stop()
        maintenance["start"]([test_host])

        stateless_job.job_spec.instance_spec[10].containers.extend(
            [
                pod_pb2.ContainerSpec(
                    resource=pod_pb2.ResourceSpec(disk_limit_mb=20)
                )
            ]
        )
        update = StatelessUpdate(
            stateless_job,
            updated_job_spec=stateless_job.job_spec,
            batch_size=0,
        )
        update.create()
        update.wait_for_state(goal_state="SUCCEEDED")

        stateless_job.stop()

        # This wait is EXPECTED to raise (time out): SLA protection must
        # keep host A from transitioning to DOWN. Reaching the assert means
        # the host went DOWN, which is a test failure.
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
    # not swallowed; AssertionError and wait timeouts are still caught.
    except Exception:
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    finally:
        # Always restart the stopped agent so later tests see a full cluster.
        host_container.start()
def create_pod_config(self, sleep_time, dynamic_factor):
    """Return a single-container PodSpec that echoes then sleeps.

    Args:
        sleep_time: seconds the container sleeps after echoing.
        dynamic_factor: value echoed first so each config is distinguishable.

    Returns:
        A pod.PodSpec wrapping one small (0.1 CPU / 32 MB) container.
    """
    spec = pod.ContainerSpec(
        resource=pod.ResourceSpec(
            cpu_limit=0.1,
            mem_limit_mb=32,
            disk_limit_mb=32,
        ),
        command=mesos.CommandInfo(
            shell=True,
            value="echo %s && sleep %s" % (str(dynamic_factor), str(sleep_time)),
        ),
    )
    return pod.PodSpec(containers=[spec])
def new_resource_constraint(self, cpu):
    """Build a host-manager ResourceConstraint requiring *cpu* CPUs free.

    Args:
        cpu: minimum CPU limit a host must be able to satisfy.

    Returns:
        A v1hostmgr.ResourceConstraint with only the CPU minimum set.
    """
    minimum_spec = pod.ResourceSpec(cpu_limit=cpu)
    return v1hostmgr.ResourceConstraint(minimum=minimum_spec)