Esempio n. 1
0
def test_fit_2():
    """Check that fitting recovers a model from its own synthetic data.

    Step times are generated from a fixed ``goodput.PerfParams`` instance
    (accumulation time and network time, each with non-negative noise
    added) and then passed to ``goodput.fit_perf_params``. The fitted
    parameters pass if their objective value is within 10% of, or lower
    than, the objective value of the generating parameters. This is a
    sanity check of the fitting code in the most optimistic case.
    """
    shape = (1000, )
    nodes = np.random.randint(low=1, high=11, size=shape)
    # The upper bound is an array, so each replica count is drawn from
    # [1, nodes] for the matching sample.
    replicas = np.random.randint(low=1, high=nodes + 1, size=shape)
    local_bsz = np.random.randint(32, 1024, size=shape)
    params = goodput.PerfParams(0.1, 0.01, 0.5, 1.0, 1e-6, 1e-6, 1.2)

    # Noise is clipped at zero so it can only lengthen the predicted times.
    ideal_accum = goodput._predict_accum_time(params, local_bsz)
    accum_noise = np.maximum(np.random.normal(0, 0.01, size=shape), 0.0)
    accum_step_time = ideal_accum + accum_noise

    ideal_network = goodput._predict_network_time(params, nodes, replicas)
    network_noise = np.maximum(np.random.normal(0, 0.01, size=shape), 0.0)
    network_time = ideal_network + network_noise

    # Combine both components with the gamma-norm of the true parameters.
    gamma = params.gamma
    combined = accum_step_time**gamma + network_time**gamma
    optim_step_time = combined**(1 / gamma)

    observations = (nodes, replicas, local_bsz,
                    accum_step_time, optim_step_time)
    result = goodput.fit_perf_params(*observations)
    loss_result = goodput._obj_fn(result, *observations)
    loss_true = goodput._obj_fn(params, *observations)

    gap = abs(loss_result - loss_true)
    tolerance = 0.1 * loss_true
    assert gap < tolerance or loss_result < loss_true, \
        ("goodput.fit failed to fit model from data generated by",
         "goodput.PerfParams(0.1, 0.01, 0.5, 1.0, 1e-6, 1e-6, 1.2)",
         "parameters: {}".format(result))
Esempio n. 2
0
def _fit_perf_params():
    """Refit the performance parameters from the recorded profile.

    Only profile entries with a positive ``"count"`` are used. Each profile
    key is unpacked into (num_nodes, num_replicas, local_bsz,
    accumulation_steps), the per-entry totals are divided by their counts,
    and the resulting arrays are handed to ``fit_perf_params``. The result
    is stored on the metrics state as ``perf_params``.
    """
    state = _metrics_state()
    keys, records = [], []
    for key, record in state.profile.items():
        # Skip entries whose count is not positive; they are divided by
        # that count below.
        if record["count"] > 0:
            keys.append(key)
            records.append(record)

    def averaged(total_key):
        # Divide each entry's total by that entry's count.
        return np.array([rec[total_key] / rec["count"] for rec in records])

    # Transpose the list of key tuples into one array per key component.
    num_nodes, num_replicas, local_bsz, accumulation_steps = map(
        np.array, zip(*keys))
    step_time = averaged("step_time")
    sync_time = averaged("sync_time")
    measured_accumulation = np.array([
        rec["accumulation_step_time"] / rec["accumulation_count"]
        if rec["accumulation_count"] > 0 else 0.0
        for rec in records])
    compute_time = step_time - sync_time
    # Where accumulation_steps is positive use the measured accumulation
    # average; elsewhere fall back to step time minus sync time.
    accumulation_time = np.where(
        accumulation_steps > 0, measured_accumulation, compute_time)
    state.perf_params = fit_perf_params(
        num_nodes, num_replicas, local_bsz, accumulation_steps,
        step_time, compute_time, accumulation_time)
Esempio n. 3
0
def _fit_perf_params():
    """Refit the performance parameters from the recorded profile.

    Only profile entries with a truthy ``"optim_count"`` are used. Each
    profile key is unpacked into (num_nodes, num_replicas, atomic_bsz).
    The accumulated times are turned into per-count averages and passed
    to ``fit_perf_params``; the result is stored on the metrics state as
    ``perf_params``.
    """
    state = _metrics_state()
    keys, records = [], []
    for key, record in state.profile.items():
        if record.get("optim_count"):
            keys.append(key)
            records.append(record)

    def column(field, default):
        # Gather one field from every kept entry into a numpy array.
        return np.array([rec.get(field, default) for rec in records])

    # Convert profile into numpy arrays.
    num_nodes, num_replicas, atomic_bsz = map(np.array, zip(*keys))
    accum_step_time = column("accum_step_time", 0.0)
    accum_count = column("accum_count", 0)
    optim_step_time = column("optim_step_time", 0.0)
    optim_sync_time = column("optim_sync_time", 0.0)
    optim_count = column("optim_count", 0)
    assert np.all(optim_count > 0)
    # Non-sync time during optimization steps should be approximately equal
    # to accumulation step time, so fold those data points into the
    # accumulation totals before averaging.
    assert np.all(optim_step_time >= optim_sync_time)
    non_sync_time = optim_step_time - optim_sync_time
    accum_step_time += non_sync_time
    accum_count += optim_count
    # Turn the totals into per-count averages (in place).
    accum_step_time /= accum_count
    optim_step_time /= optim_count
    state.perf_params = fit_perf_params(num_nodes, num_replicas, atomic_bsz,
                                        accum_step_time, optim_step_time)