Example #1
0
def test_allocate_job():
    """Place four single jobs one at a time and verify each placement."""
    # Three nodes with increasing capacity; only node "2" is preemptible.
    specs = [(1, 500, False), (2, 2000, False), (2, 3000, True)]
    nodes = {str(idx): NodeInfo({"gpu": gpu, "cpu": cpu, "pods": 32},
                                preemptible=preempt)
             for idx, (gpu, cpu, preempt) in enumerate(specs)}
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()

    def make_job(cpu, minute, min_rep, max_rep):
        # Helper: jobs differ only in cpu request, arrival time and bounds.
        return JobInfo({"gpu": 1, "cpu": cpu, "pods": 1}, speedup_fn,
                       now + timedelta(minutes=minute), min_rep,
                       max_replicas=max_rep)

    job_1 = make_job(500, 0, 0, 1)
    job_2 = make_job(1000, 1, 0, 1)
    job_3 = make_job(1000, 1, 2, 2)
    job_4 = make_job(2000, 1, 2, 2)
    policy = PolluxPolicy()

    assert policy.allocate_job(job_1, nodes) == ["0"]
    assert policy.allocate_job(job_2, nodes) == ["1"]
    assert policy.allocate_job(job_3, nodes) == ["1", "1"]
    assert policy.allocate_job(job_4, nodes) == []
Example #2
0
def test_unusable_node():
    """Optimize with one node that is too small (cpu) for any job to fit."""
    big_resources = {"gpu": 1, "cpu": 8000, "pods": 32}
    nodes = {
        0: NodeInfo({"gpu": 1, "cpu": 500, "pods": 32}, preemptible=False),
        1: NodeInfo(dict(big_resources), preemptible=False),
        2: NodeInfo(dict(big_resources), preemptible=False),
    }
    template = NodeInfo(dict(big_resources), preemptible=True)
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    # Three identical jobs arriving one minute apart.
    jobs = {idx: JobInfo({"gpu": 1, "cpu": 1000, "pods": 1}, speedup_fn,
                         now + timedelta(minutes=idx), 0, max_replicas=1)
            for idx in range(3)}
    policy = PolluxPolicy()
    allocations, desired_nodes = policy.optimize(jobs, nodes, {}, template)
    # More nodes should be requested since node 0 is unusable.
    assert desired_nodes > 3
    # No job may receive more than one replica.
    assert max(len(alloc) for alloc in allocations.values()) == 1
    # Only the two usable nodes can host jobs, so exactly two are placed.
    assert sum(len(alloc) for alloc in allocations.values()) == 2
Example #3
0
 def hints(self):
     """Return a deep copy of the last reported metrics with the
     "gradParams" and "perfParams" entries converted to GradParams /
     PerfParams namedtuples.

     The raw metrics store a "norm" key inside "gradParams"; it is
     renamed to "sqr" to match the GradParams field name.  Falsy
     metrics (None or empty) are returned untouched.
     """
     snapshot = copy.deepcopy(self._last_metrics)
     if not snapshot:
         return snapshot
     grad = snapshot["gradParams"]
     if grad:
         # Rename "norm" -> "sqr" to line up with GradParams' field.
         grad["sqr"] = grad.pop("norm")
     else:
         # No gradient statistics reported; fall back to neutral values.
         grad = {"sqr": 1.0, "var": 1.0}
     snapshot["gradParams"] = GradParams(**grad)
     snapshot["perfParams"] = PerfParams(**snapshot["perfParams"])
     return snapshot
Example #4
0
def test_optimize(num_nodes, total_devices=16):
    """Run PolluxPolicy.optimize three times and sanity-check allocations.

    Verifies that no job is allocated more than its max_replicas and that
    no node is assigned more replicas than it has GPUs or pod slots.
    """
    assert total_devices % num_nodes == 0
    num_devices = total_devices // num_nodes
    print("{}x{} nodes:".format(num_nodes, num_devices))
    # Make up a realistic speedup function.
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634,
                             0.0118, 0.00317, 1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn, max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    jobs = {}
    # Add a few jobs.
    job_resources = {"nvidia.com/gpu": 1, "pods": 1}
    for key in range(16):
        # BUG FIX: the original line ended with a trailing comma, which
        # made creation_timestamp a 1-tuple instead of a datetime.
        creation_timestamp = now + timedelta(minutes=key)
        jobs[key] = JobInfo(job_resources, speedup_fn, creation_timestamp,
                            min_replicas=0, max_replicas=8)
    # Add a few nodes.
    node_resources = {"nvidia.com/gpu": num_devices, "pods": 32}
    nodes = {i: NodeInfo(node_resources, preemptible=False)
             for i in range(num_nodes)}
    # Add a node template.
    node_template = NodeInfo(node_resources, preemptible=True)
    policy = PolluxPolicy()
    prev_allocs = {}
    for i in range(3):
        start = time.time()
        allocations, desired_nodes = \
            policy.optimize(jobs, nodes, prev_allocs, node_template)
        duration = time.time() - start
        # BUG FIX: tidied the garbled "{}s sec" in the progress message.
        print("optimize {}x ({:.3f} sec):".format(i + 1, duration))
        node_count = Counter()
        for job_key, placement in allocations.items():
            assert len(placement) <= jobs[job_key].max_replicas
            for node_key in placement:
                node_count[node_key] += 1
        for node_key, count in node_count.items():
            assert count <= nodes[node_key].resources["nvidia.com/gpu"]
            assert count <= nodes[node_key].resources["pods"]
Example #5
0
 def _get_job_info(self, job):
     """Build a JobInfo for the scheduler from a job manifest dict.

     Reads resource requests from the pod template, replica bounds from
     the spec, and profiled "train" hints from the job status.  When
     profiling hints are present and the job is preemptible, constructs
     a goodput-based speedup function; otherwise falls back to a linear
     speedup (replicas -> replicas).
     """
     # Fill in any missing resource requests/limits before reading them.
     job["spec"]["template"]["spec"] = \
         set_default_resources(job["spec"]["template"]["spec"])
     resources = get_pod_requests(job["spec"]["template"]["spec"])
     # Profiled training hints reported in the job status (may be {}).
     hints = job.get("status", {}).get("train", {})
     # Allow scaling up to 2x the largest profiled replica count, min 1.
     max_replicas = max(2 * hints.get("maxProfiledReplicas", 0), 1)
     if job["spec"].get("maxReplicas"):
         # Spec-level cap takes precedence over the profiled heuristic.
         max_replicas = min(max_replicas, job["spec"]["maxReplicas"])
     min_replicas = job["spec"].get("minReplicas", 0)
     # max_replicas should be greater or equal to min_replicas
     max_replicas = max(max_replicas, min_replicas)
     preemptible = job["spec"].get("preemptible", True)
     if {"perfParams", "initBatchSize"} <= hints.keys() and preemptible:
         max_batch_size = (hints.get("maxBatchSize")
                           or hints["initBatchSize"])
         if hints.get("localBszBounds"):
             min_local_bsz = hints["localBszBounds"][0] or 1
             # Make sure max_batch_size / replicas >= min_local_bsz
             if max_batch_size < min_local_bsz * max_replicas:
                 max_replicas = int(max_batch_size / min_local_bsz)
         perf_params = PerfParams(
             *[hints["perfParams"][k] for k in PERF_PARAMS.keys()])
         if "gradParams" in hints:
             # NOTE(review): the "norm" hint is passed as GradParams'
             # first (sqr) field -- presumably the profiled squared
             # gradient norm; confirm against the hint producer.
             grad_params = GradParams(hints["gradParams"]["norm"],
                                      hints["gradParams"]["var"])
         else:
             # No gradient statistics profiled yet; use neutral values.
             grad_params = GradParams(0.0, 1.0)
         goodput_fn = GoodputFunction(perf_params, grad_params,
                                      hints["initBatchSize"],
                                      self._metrics_options)
         speedup_fn = SpeedupFunction(
             goodput_fn, hints.get("maxBatchSize"),
             hints.get("localBszBounds"),
             hints.get("gradientAccumulation", False))
     else:
         # Without profiling data assume speedup == replica count.
         speedup_fn = lambda n, r: r  # noqa: E731
     creation_ts = dateutil.parser.isoparse(
         job["metadata"]["creationTimestamp"])
     return JobInfo(resources, speedup_fn, creation_ts, min_replicas,
                    max_replicas, preemptible)
Example #6
0
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from adaptdl.goodput import GoodputFunction, PerfParams, GradParams
import itertools
import numpy as np
import pytest

# Fixed-seed RNG so the sampled parameter sets are reproducible runs-to-run.
RNG = np.random.RandomState(0)
# Ten random parameter sets for parametrized tests: PerfParams takes 7
# gamma-distributed values, GradParams takes 2.
PERF_PARAMS = [PerfParams(*RNG.gamma(2.0, 2.0, [7])) for i in range(10)]
GRAD_PARAMS = [GradParams(*RNG.gamma(2.0, 2.0, [2])) for i in range(10)]


def groupby_indices(*args):
    """Group positions 0..n-1 of the input sequences by the tuple of
    values they take across *args.

    Returns a list of index lists, ordered by first occurrence; indices
    in the same list share identical values in every input sequence.
    """
    # Columns of the stacked array are per-position value tuples; the
    # inverse mapping labels each position with its unique-column id.
    _, labels = np.unique(np.stack(args), axis=1, return_inverse=True)
    buckets = {}
    for position, label in enumerate(labels):
        if label in buckets:
            buckets[label].append(position)
        else:
            buckets[label] = [position]
    return list(buckets.values())


@pytest.mark.parametrize("perf_params", PERF_PARAMS)
@pytest.mark.parametrize("grad_params", GRAD_PARAMS)
def test_evaluate(perf_params, grad_params):
    init_batch_size = 16
Example #7
0
def test_optimize(num_nodes, total_devices=16):
    """Run PolluxPolicy.optimize for three cycles over a mixed job set.

    Half of the jobs are preemptible, half are not.  Each cycle checks
    replica bounds and per-node resource limits, verifies that
    non-preemptible jobs keep their previous allocations, then removes
    one randomly chosen job.
    """
    # Globals
    N_JOBS = 10
    JOBS = list(range(N_JOBS))
    random.shuffle(JOBS)

    PREEMPTIBLE_IDXS = JOBS[:len(JOBS) // 2]
    NON_PREEMPTIBLE_IDXS = JOBS[len(JOBS) // 2:]

    assert total_devices % num_nodes == 0
    num_devices = total_devices // num_nodes
    print(f"{num_nodes}x{num_devices} nodes:")
    # Make up a realistic speedup function.
    perf_params = PerfParams(0.121, 0.00568, 0.0236, 0.00634, 0.0118, 0.00317,
                             1.14)
    grad_params = GradParams(sqr=0.00136, var=0.000502)
    goodput_fn = GoodputFunction(perf_params, grad_params, 128)
    speedup_fn = SpeedupFunction(goodput_fn,
                                 max_batch_size=1280,
                                 atomic_bsz_range=(64, 256))
    now = datetime.now()
    # Add a node template.
    policy = PolluxPolicy()
    job_resources = {"nvidia.com/gpu": 1, "pods": 1}
    # Add a few nodes.
    node_resources = {"nvidia.com/gpu": num_devices, "pods": 32}
    nodes = {
        i: NodeInfo(node_resources, preemptible=False)
        for i in range(num_nodes)
    }
    node_template = NodeInfo(node_resources, preemptible=True)

    # Empty allocations
    prev_allocs = {i: [] for i in JOBS}
    for cycle in range(3):
        # Rebuild the JobInfo map each cycle (the job set shrinks).
        jobs = {}
        for i in PREEMPTIBLE_IDXS:
            # BUG FIX: removed trailing comma that made the timestamp a
            # 1-tuple instead of a datetime.
            creation_timestamp = now + timedelta(minutes=i)
            jobs[i] = JobInfo(job_resources,
                              speedup_fn,
                              creation_timestamp,
                              min_replicas=0,
                              max_replicas=8)
        for i in NON_PREEMPTIBLE_IDXS:
            # BUG FIX: same trailing-comma tuple bug as above.
            creation_timestamp = now + timedelta(minutes=i)
            jobs[i] = JobInfo(job_resources,
                              speedup_fn,
                              creation_timestamp,
                              min_replicas=2,
                              max_replicas=4,
                              preemptible=False)
        start = time.time()
        assert len(jobs) > 0
        allocations, desired_nodes = \
            policy.optimize(jobs, nodes, prev_allocs, node_template)
        duration = time.time() - start
        # BUG FIX: tidied the garbled "s sec" in the progress message.
        print(f"optimize {cycle + 1}x ({duration:.3f} sec)")
        node_count = Counter()
        for job_key, placement in allocations.items():
            assert len(placement) <= jobs[job_key].max_replicas
            if placement:
                assert len(placement) >= jobs[job_key].min_replicas
            for node_key in placement:
                node_count[node_key] += 1
        for node_key, count in node_count.items():
            assert count <= nodes[node_key].resources["nvidia.com/gpu"]
            assert count <= nodes[node_key].resources["pods"]

        # Check if we are maintaining allocations for non-preemptible jobs
        for i in NON_PREEMPTIBLE_IDXS:
            if (i in allocations) and prev_allocs[i]:
                assert allocations[i] == prev_allocs[i]

        prev_allocs = copy.deepcopy(allocations)
        # Remove one random job.
        # BUG FIX: random.sample() rejects dict views (TypeError on
        # Python >= 3.11); pick from a concrete list instead.
        remove = random.choice(list(allocations))
        if remove in NON_PREEMPTIBLE_IDXS:
            NON_PREEMPTIBLE_IDXS.remove(remove)
            print(f"Deleting non-preemptible job {remove}")
        else:
            PREEMPTIBLE_IDXS.remove(remove)
            print(f"Deleting preemptible job {remove}")
        prev_allocs.pop(remove)