def run(rank, size, run_id):
    """Distributed synchronous SGD training loop.

    Trains ``Net`` for 10 epochs on this worker's partition of the
    dataset, averaging gradients across all workers after each backward
    pass, and posts the per-epoch mean loss to the mlbench master.

    Args:
        rank (int): Rank of the current worker, used to label the metric.
        size (int): Total number of workers (unused in the body; kept
            for the standard distributed-launcher signature).
        run_id (str): Id of the current run, passed when posting metrics.
    """
    # Fixed seed so every worker starts from identical model weights.
    torch.manual_seed(1234)
    train_set, bsz = partition_dataset()
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    api_client = ApiClient(in_cluster=True, k8s_namespace='default',
                           label_selector='component=master,app=mlbench')

    num_batches = ceil(len(train_set.dataset) / float(bsz))
    for epoch in range(10):
        epoch_loss = 0.0
        for data, target in train_set:
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            # loss.item() replaces the deprecated loss.data.item() access.
            epoch_loss += loss.item()
            loss.backward()
            # Synchronous SGD: all-reduce the gradients across workers
            # before stepping, so every replica applies the same update.
            average_gradients(model)
            optimizer.step()
        logging.debug('Rank %s, epoch %s: %s',
                      dist.get_rank(), epoch, epoch_loss / num_batches)
        api_client.post_metric(run_id, "Rank {} loss".format(rank),
                               epoch_loss / num_batches)
def log_metrics(run_id, rank, epoch, metric_name, value, tracker=None, time=None):
    """Log a metric to the mlbench master/dashboard.

    The metric is posted to the master API only when running inside the
    cluster (``MLBENCH_IN_DOCKER`` unset). Independently of that, when
    both ``tracker`` and ``time`` are supplied the metric is appended to
    the tracker's local ``records`` list.

    Args:
        run_id (str): The id of the current run
        rank (int): The rank of the current worker
        epoch (int): The current epoch
        metric_name (str): The name of the metric to save
        value (Any): The metric value
        tracker (optional): Local tracker exposing a ``records`` list
            that the metric record is appended to. Default: ``None``
        time (optional): Cumulative time value stored with the local
            record; the record is only written when this is truthy.
            Default: ``None``
    """
    in_cluster = os.getenv("MLBENCH_IN_DOCKER") is None
    metric_name = "{} @ {}".format(metric_name, rank)
    # Built once for both sinks; the doubled braces produce a literal
    # "{rank: r, epoch:e}" string.
    metadata = "{{rank: {}, epoch:{}}}".format(rank, epoch)

    if in_cluster:
        api = ApiClient()
        api.post_metric(run_id, metric_name, value, metadata=metadata)

    if tracker and time:
        tracker.records.append(
            {
                "run_id": run_id,
                "name": metric_name,
                "cumulative": True,
                "date": str(datetime.datetime.now()),
                "time": str(time),
                "value": str(value),
                "metadata": metadata,
            }
        )
def test_post_metrics(mocker, kubernetes_api_client_node_port):
    """post_metric returns a future whose JSON payload is the mocked value."""
    mocker.patch("kubernetes.config.load_kube_config")
    executor_cls = mocker.patch("concurrent.futures.ThreadPoolExecutor")
    submitted = executor_cls.return_value.submit.return_value
    submitted.result.return_value.json.return_value = "a"

    api = ApiClient(in_cluster=False)
    response = api.post_metric("1", "loss", 10.0, cumulative=False)

    assert response is not None
    assert response.result().json() == "a"