Example 1
    async def _find_nodes(self):
        node_infos = {}
        node_list = await self._core_api.list_node()
        # Find all non-AdaptDL pods which are taking up resources and subtract
        # those resources from the available pool. Apparently there's no more
        # efficient way to get currently available resources in k8s. We also
        # check if we have reached the pod limit on the node. This number
        # denotes (allocatable pods - non-terminated pods) on that node.
        pod_list = await self._core_api.list_pod_for_all_namespaces(
            label_selector="!adaptdl/job")
        for node in node_list.items:
            if allowed_taints(node.spec.taints):
                resources = get_node_unrequested(node, pod_list.items)
                if not resources.get("pods"):
                    LOG.warning(f"node {node.metadata.name} "
                                "has no free pods available.")
                node_infos[node.metadata.name] = NodeInfo(resources, False)
        # For cluster autoscaling: to determine if additional nodes would be
        # helpful, add a few "virtual" nodes which only become available in
        # "eta" seconds. Currently, we only consider as many virtual nodes as
        # there are real nodes. We infer each resource to be the maximum amount
        # observed in any real node.
        max_resources = {}
        for node_name in node_infos:
            for key, val in node_infos[node_name].resources.items():
                if key not in max_resources or val > max_resources[key]:
                    max_resources[key] = val
        node_template = NodeInfo(max_resources, True)
        return node_infos, node_template
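
The (node_infos, node_template) pair built here is what the policy layer consumes. Below is a minimal sketch of that hand-off, assuming the same PolluxPolicy.optimize signature used in the later examples; job_infos, prev_allocs, and self._policy are placeholders, not part of this excerpt.

# Sketch only: job_infos and prev_allocs would come from the job-tracking side
# of the controller; self._policy is assumed to hold a PolluxPolicy instance.
node_infos, node_template = await self._find_nodes()
allocations, desired_nodes = self._policy.optimize(
    job_infos,       # {job_key: JobInfo}
    node_infos,      # {node_name: NodeInfo} from _find_nodes
    prev_allocs,     # {job_key: [node_name, ...]} from the previous round
    node_template)   # virtual-node template for autoscaling decisions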
Example 2
def test_allocate_job():
    nodes = {
        "0": NodeInfo({"gpu": 1, "cpu": 500, "pods": 32}, preemptible=False),
        "1": NodeInfo({"gpu": 2, "cpu": 2000, "pods": 32}, preemptible=False),
        "2": NodeInfo({"gpu": 2, "cpu": 3000, "pods": 32}, preemptible=True),
    }
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    min_replicas = 0
    job_1 = JobInfo({"gpu": 1, "cpu": 500, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=0), min_replicas, max_replicas=1)
    job_2 = JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=1), min_replicas, max_replicas=1)
    job_3 = JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=1), 2, max_replicas=2)
    job_4 = JobInfo({"gpu": 1, "cpu": 2000, "pods": 1}, speedup_fn,
                    now + timedelta(minutes=1), 2, max_replicas=2)
    policy = PolluxPolicy()

    assert policy.allocate_job(job_1, nodes) == ["0"]
    assert policy.allocate_job(job_2, nodes) == ["1"]
    assert policy.allocate_job(job_3, nodes) == ["1", "1"]
    assert policy.allocate_job(job_4, nodes) == []
Example 3
def test_unusable_node():
    # Test where one of the nodes can't be used due to one resource type.
    nodes = {
        0: NodeInfo({"gpu": 1, "cpu": 500, "pods": 32}, preemptible=False),
        1: NodeInfo({"gpu": 1, "cpu": 8000, "pods": 32}, preemptible=False),
        2: NodeInfo({"gpu": 1, "cpu": 8000, "pods": 32}, preemptible=False),
    }
    template = NodeInfo({"gpu": 1, "cpu": 8000, "pods": 32}, preemptible=True)
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    min_replicas = 0
    jobs = {
        0: JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                   now + timedelta(minutes=0), min_replicas, max_replicas=1),
        1: JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                   now + timedelta(minutes=1), min_replicas, max_replicas=1),
        2: JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                   now + timedelta(minutes=2), min_replicas, max_replicas=1),
    }
    policy = PolluxPolicy()
    allocations, desired_nodes = policy.optimize(jobs, nodes, {}, template)
    # Check that more nodes are asked for.
    assert desired_nodes > 3
    # Check no job was allocated more than 1 replica.
    assert max(len(alloc) for alloc in allocations.values()) == 1
    # Check two jobs were allocated.
    assert sum(len(alloc) for alloc in allocations.values()) == 2
Example 4
    def __init__(self, nodes: List = None):
        nodes = nodes if nodes is not None else config.nodes()
        self._node_infos = {
            node['NodeManagerAddress']: NodeInfo(node['Resources'],
                                                 preemptible=False)
            for node in nodes
        }
        self._default_node = cycle(list(self._node_infos))
        # Add a node template.
        self._node_template = NodeInfo(
            list(self._node_infos.values())[0].resources,
            preemptible=False)
        self._policy = PolluxPolicy()
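
For context, this constructor indexes each entry by 'NodeManagerAddress' and 'Resources', which matches the shape of Ray's node descriptions, so callers can also pass an explicit list shaped like the sketch below. The class name AdaptDLAllocator and the literal addresses and resource values are assumptions for illustration only.

# Hypothetical input, inferred from the keys the constructor reads; the class
# name AdaptDLAllocator and the addresses/resource values are illustrative.
nodes = [
    {"NodeManagerAddress": "10.0.0.1",
     "Resources": {"GPU": 1, "CPU": 8, "pods": 32}},
    {"NodeManagerAddress": "10.0.0.2",
     "Resources": {"GPU": 2, "CPU": 16, "pods": 32}},
]
allocator = AdaptDLAllocator(nodes)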
Example 5
    def allocate(self,
                 jobs: List[AdaptDLJobMixin],
                 nodes: List = None) -> (Dict, int):
        """ Use Pollux to distribute available resources between jobs."""
        if nodes is not None:
            node_infos = {
                node['NodeManagerAddress']: NodeInfo(node['Resources'],
                                                     preemptible=False)
                for node in nodes
            }
        else:
            node_infos = self._node_infos

        assert len(jobs) > 0
        # gather JobInfos
        job_infos = {job.job_id: job.job_info for job in jobs}
        # gather previous allocations
        prev_allocs = {job.job_id: job.allocation for job in jobs}

        allocations, desired_nodes = \
            self._policy.optimize(job_infos,
                                  node_infos,
                                  prev_allocs,
                                  self._node_template)
        # Fill empty allocations for jobs which didn't get any
        for job_id in job_infos:
            allocations[job_id] = allocations.get(job_id, [])

        assert not all(v == [] for v in allocations.values())
        return allocations, desired_nodes
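
A minimal calling sketch for the method above, assuming job objects that expose job_id, job_info, and allocation, which is all allocate() reads; the SimpleNamespace stand-in and some_job_info are hypothetical and not part of the AdaptDL API.

# Hypothetical stand-in for an AdaptDLJobMixin, exposing only the three
# attributes that allocate() actually uses.
from types import SimpleNamespace

# some_job_info stands for a JobInfo built as in the earlier examples.
fake_job = SimpleNamespace(
    job_id="job-0", job_info=some_job_info, allocation=[])
allocations, desired_nodes = allocator.allocate([fake_job])
# allocations maps "job-0" to a list of node addresses ([] if nothing fit);
# desired_nodes is the node count the policy would like the cluster to have.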
Example 6
def test_optimize(num_nodes, total_devices=16):
    assert total_devices % num_nodes == 0
    num_devices = total_devices // num_nodes
    print("{}x{} nodes:".format(num_nodes, num_devices))
    # Make up a realistic speedup function.
    params = Params(0.121, 0.00568, 0.0236, 0.00634, 0.0118, 0.00317, 1.14)
    grad_params = {"norm": 0.00136, "var": 0.000502}
    speedup_fn = SpeedupFunction(params,
                                 grad_params,
                                 init_batch_size=128,
                                 max_batch_size=1280,
                                 local_bsz_bounds=(64, 256),
                                 elastic_bsz=True)
    now = datetime.now()
    jobs = {}
    # Add a few jobs.
    job_resources = {"nvidia.com/gpu": 1, "pods": 1}
    for i in range(16):
        creation_timestamp = now + timedelta(minutes=len(jobs))
        max_replicas = 8
        key = len(jobs)
        jobs[key] = JobInfo(job_resources, speedup_fn, creation_timestamp,
                            max_replicas)
    # Add a few nodes.
    node_resources = {"nvidia.com/gpu": num_devices, "pods": 32}
    nodes = {
        i: NodeInfo(node_resources, preemptible=False)
        for i in range(num_nodes)
    }
    # Add a node template.
    node_template = NodeInfo(node_resources, preemptible=True)
    policy = PolluxPolicy()
    prev_allocs = {}
    for i in range(3):
        start = time.time()
        allocations, desired_nodes = \
            policy.optimize(jobs, nodes, prev_allocs, node_template)
        duration = time.time() - start
        print("optimize {}x ({}s sec):".format(i + 1, duration))
        node_count = Counter()
        for job_key, placement in allocations.items():
            assert len(placement) <= jobs[job_key].max_replicas
            for node_key in placement:
                node_count[node_key] += 1
        for node_key, count in node_count.items():
            assert count <= nodes[node_key].resources["nvidia.com/gpu"]
            assert count <= nodes[node_key].resources["pods"]
Example 7
def test_optimize(num_nodes, total_devices=16):
    # Globals
    N_JOBS = 10
    JOBS = list(range(N_JOBS))
    random.shuffle(JOBS)

    PREEMPTIBLE_IDXS = JOBS[:len(JOBS) // 2]
    NON_PREEMPTIBLE_IDXS = JOBS[len(JOBS) // 2:]

    assert total_devices % num_nodes == 0
    num_devices = total_devices // num_nodes
    print(f"{num_nodes}x{num_devices} nodes:")
    # Make up a realistic speedup function.
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634, 0.0118, 0.00317,
                             1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn,
                                 max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    # Add a node template.
    policy = PolluxPolicy()
    job_resources = {"nvidia.com/gpu": 1, "pods": 1}
    # Add a few nodes.
    node_resources = {"nvidia.com/gpu": num_devices, "pods": 32}
    nodes = {
        i: NodeInfo(node_resources, preemptible=False)
        for i in range(num_nodes)
    }
    node_template = NodeInfo(node_resources, preemptible=True)

    # Empty allocations
    prev_allocs = {i: [] for i in JOBS}
    for cycle in range(3):
        # Start allocation cycle
        jobs = {}
        for i in PREEMPTIBLE_IDXS:
            creation_timestamp = now + timedelta(minutes=i)
            jobs[i] = JobInfo(job_resources,
                              speedup_fn,
                              creation_timestamp,
                              min_replicas=0,
                              max_replicas=8)
        for i in NON_PREEMPTIBLE_IDXS:
            creation_timestamp = now + timedelta(minutes=i)
            jobs[i] = JobInfo(job_resources,
                              speedup_fn,
                              creation_timestamp,
                              min_replicas=2,
                              max_replicas=4,
                              preemptible=False)
        start = time.time()
        assert len(jobs) > 0
        allocations, desired_nodes = \
            policy.optimize(jobs, nodes, prev_allocs, node_template)
        duration = time.time() - start
        print(f"optimize {cycle + 1}x ({duration}s sec)")
        node_count = Counter()
        for job_key, placement in allocations.items():
            assert len(placement) <= jobs[job_key].max_replicas
            if placement:
                assert len(placement) >= jobs[job_key].min_replicas
            for node_key in placement:
                node_count[node_key] += 1
        for node_key, count in node_count.items():
            assert count <= nodes[node_key].resources["nvidia.com/gpu"]
            assert count <= nodes[node_key].resources["pods"]

        # Check if we are maintaining allocations for non-preemptible jobs
        for i in NON_PREEMPTIBLE_IDXS:
            if (i in allocations) and prev_allocs[i]:
                assert allocations[i] == prev_allocs[i]

        prev_allocs = copy.deepcopy(allocations)
        # Remove one random job
        remove = random.choice(list(allocations.keys()))
        if remove in NON_PREEMPTIBLE_IDXS:
            NON_PREEMPTIBLE_IDXS.remove(remove)
            print(f"Deleting non-preemptible job {remove}")
        else:
            PREEMPTIBLE_IDXS.remove(remove)
            print(f"Deleting preemptible job {remove}")
        prev_allocs.pop(remove)
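
For reference, the test excerpts above assume roughly the following imports. The standard-library ones follow directly from the code; the AdaptDL module paths are commented out because they are assumptions about the project layout and may differ between versions.

import copy
import random
import time
from collections import Counter
from datetime import datetime, timedelta

# AdaptDL-side names used above; the exact module paths are assumptions and
# may differ by version.
# from adaptdl.goodput import GoodputFunction, GradParams, PerfParams
# from adaptdl_sched.policy.pollux import PolluxPolicy
# from adaptdl_sched.policy.speedup import SpeedupFunction
# from adaptdl_sched.policy.utils import JobInfo, NodeInfo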