Esempio n. 1
0
def run(rank, size, run_id):
    """Distributed synchronous SGD example.

    Trains a ``Net`` for 10 epochs on the data returned by
    ``partition_dataset()``. After each backward pass ``average_gradients``
    is applied to the model before the optimizer step. At the end of every
    epoch the mean loss per batch is written to the debug log and posted
    through ``ApiClient.post_metric``.

    Args:
        rank: Rank of this worker; only used to label the posted metric.
        size: Not used inside this function.
        run_id: Identifier of the run the metric is posted under.
    """
    torch.manual_seed(1234)
    loader, batch_size = partition_dataset()
    net = Net()
    sgd = optim.SGD(net.parameters(), lr=0.01, momentum=0.5)

    client = ApiClient(
        in_cluster=True,
        k8s_namespace='default',
        label_selector='component=master,app=mlbench',
    )

    # Number of batches used to turn the summed loss into a per-batch mean.
    batches_per_epoch = ceil(len(loader.dataset) / float(batch_size))
    metric_label = "Rank {} loss".format(rank)

    for epoch in range(10):
        running_loss = 0.0
        for inputs, labels in loader:
            sgd.zero_grad()
            batch_loss = F.nll_loss(net(inputs), labels)
            running_loss += batch_loss.data.item()
            batch_loss.backward()
            # Presumably synchronises gradients across workers; the helper
            # is defined elsewhere -- confirm against its implementation.
            average_gradients(net)
            sgd.step()

        mean_loss = running_loss / batches_per_epoch
        logging.debug('Rank %s, epoch %s: %s', dist.get_rank(), epoch,
                      mean_loss)
        client.post_metric(run_id, metric_label, mean_loss)
Esempio n. 2
0
def log_metrics(run_id, rank, epoch, metric_name, value, tracker=None, time=None):
    """ Log metrics to mlbench master/dashboard

    The metric is posted through ``ApiClient`` unless the environment
    variable ``MLBENCH_IN_DOCKER`` is set. Independently of that, when both
    ``tracker`` and ``time`` are given, a record describing the metric is
    appended to ``tracker.records``.

    Args:
        run_id (str): The id of the current run
        rank (int): The rank of the current worker
        epoch (int): The current epoch
        metric_name (str): The name of the metric to save; the rank is
            appended to it as ``"<metric_name> @ <rank>"``
        value (Any): The metric value
        tracker (Any, optional): Object exposing a ``records`` list to which
            the metric record is appended. Default: `None` (no record kept)
        time (Any, optional): Time associated with the metric; stored as
            ``str(time)``. A value of ``0`` is valid and is recorded.
            Default: `None` (no record kept)
    """
    in_cluster = os.getenv("MLBENCH_IN_DOCKER") is None

    metric_name = "{} @ {}".format(metric_name, rank)
    # Same metadata string is sent to the API and stored in the tracker.
    metadata = "{{rank: {}, epoch:{}}}".format(rank, epoch)

    if in_cluster:
        api = ApiClient()
        api.post_metric(
            run_id,
            metric_name,
            value,
            metadata=metadata,
        )

    # Compare against None explicitly: a truthiness test would silently drop
    # records whose time is 0 / 0.0 or whose tracker object evaluates falsy.
    if tracker is not None and time is not None:
        tracker.records.append(
            {
                "run_id": run_id,
                "name": metric_name,
                "cumulative": True,
                "date": str(datetime.datetime.now()),
                "time": str(time),
                "value": str(value),
                "metadata": metadata,
            }
        )
Esempio n. 3
0
def test_post_metrics(mocker, kubernetes_api_client_node_port):
    """Check that ``post_metric`` hands back the stubbed executor result."""
    mocker.patch("kubernetes.config.load_kube_config")
    executor_cls = mocker.patch("concurrent.futures.ThreadPoolExecutor")

    # Whatever is submitted to the patched executor yields a future whose
    # result() has a json() returning "a".
    stub_future = executor_cls.return_value.submit.return_value
    stub_response = stub_future.result.return_value
    stub_response.json.return_value = "a"

    api = ApiClient(in_cluster=False)
    outcome = api.post_metric("1", "loss", 10.0, cumulative=False)

    assert outcome is not None
    assert outcome.result().json() == "a"