Example #1
def inference_benchmark_nongraphed(model, data_loader, num_batches=100):
    """Measure per-batch inference latency without CUDA graphs and report AUC."""
    model.eval()
    base_device = FLAGS.base_device
    latencies = []

    y_true = []
    y_score = []

    with torch.no_grad():
        for step, (numerical_features, categorical_features, click) in enumerate(data_loader):
            if step >= num_batches:
                break

            step_start_time = time()

            numerical_features = numerical_features.to(base_device)
            if FLAGS.amp:
                numerical_features = numerical_features.half()

            categorical_features = categorical_features.to(device=base_device, dtype=torch.int64)

            inference_result = model(numerical_features, categorical_features).squeeze()
            torch.cuda.synchronize()
            step_time = time() - step_start_time

            if step >= FLAGS.benchmark_warmup_steps:
                latencies.append(step_time)

            y_true.append(click)
            y_score.append(inference_result.reshape([-1]).clone())

    y_true = torch.cat(y_true)
    y_score = torch.sigmoid(torch.cat(y_score)).float()
    auc = utils.roc_auc_score(y_true, y_score)
    print('auc: ', auc)

    return latencies
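
These snippets are excerpts from a larger module, so names like FLAGS (absl flags), utils, and time are assumed to be in scope. As a hypothetical usage sketch (model and test_loader are placeholders, not part of the original), the returned latencies could be summarized like this:

import numpy as np

# Hypothetical usage: `model` and `test_loader` stand in for a real
# DLRM model and test DataLoader.
latencies = inference_benchmark_nongraphed(model, test_loader, num_batches=100)
print(f"mean latency: {np.mean(latencies) * 1000:.2f} ms")
print(f"p95 latency:  {np.percentile(latencies, 95) * 1000:.2f} ms")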
Example #2
def evaluate(model, loss_fn, data_loader):
    """Evaluate a DLRM model on the test set.

    Args:
        model (dlrm): Model to evaluate
        loss_fn (torch.nn.Module): Loss function
        data_loader (torch.utils.data.DataLoader): Test data loader

    Returns:
        tuple: (global average loss, AUC, average step time)
    """
    model.eval()
    print_freq = FLAGS.print_freq
    prefetching_enabled = is_data_prefetching_enabled()

    steps_per_epoch = len(data_loader)
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'loss', utils.SmoothedValue(window_size=1, fmt='{avg:.4f}'))
    metric_logger.add_meter(
        'step_time', utils.SmoothedValue(window_size=1, fmt='{avg:.4f}'))

    if prefetching_enabled:
        data_stream = torch.cuda.Stream()

    with torch.no_grad():
        y_true = []
        y_score = []

        timer = utils.StepTimer()
        timer.click()

        input_pipeline = iter(data_loader)

        if prefetching_enabled:
            input_pipeline = prefetcher(input_pipeline, data_stream)

        for step, (numerical_features, categorical_features,
                   click) in enumerate(input_pipeline):
            if FLAGS.amp:
                numerical_features = numerical_features.half()

            if prefetching_enabled:
                torch.cuda.synchronize()

            output = model(numerical_features, categorical_features).squeeze()

            loss = loss_fn(output, click)
            y_true.append(click)
            y_score.append(output)

            loss_value = loss.item()
            timer.click()

            if timer.measured is not None:
                metric_logger.update(loss=loss_value, step_time=timer.measured)
                if step % print_freq == 0 and step > 0:
                    metric_logger.print(
                        header=f"Test: [{step}/{steps_per_epoch}]")

        y_true = torch.cat(y_true)
        y_score = torch.cat(y_score)

        before_auc_timestamp = time()
        auc = utils.roc_auc_score(y_true=y_true, y_score=y_score)
        print(f'AUC computation took: {time() - before_auc_timestamp:.2f} [s]')

    model.train()

    return metric_logger.loss.global_avg, auc, metric_logger.step_time.avg
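
Both evaluate and dist_evaluate consume batches through a prefetcher helper that is not shown in this listing. A minimal sketch of what such a helper might look like, assuming each batch is a tuple of CUDA tensors whose host-to-device copies are issued on the side stream:

def prefetcher(load_iterator, prefetch_stream):
    # Hypothetical sketch of the prefetcher used above: fetch the next
    # batch on a side stream while the current one is being consumed.
    def _prefetch():
        with torch.cuda.stream(prefetch_stream):
            try:
                return next(load_iterator)
            except StopIteration:
                return None

    next_batch = _prefetch()
    while next_batch is not None:
        # Make sure the copies issued on the side stream have finished
        # before the main stream touches the tensors.
        torch.cuda.current_stream().wait_stream(prefetch_stream)
        batch = next_batch
        next_batch = _prefetch()
        yield batch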
Example #3
def dist_evaluate(model, data_loader):
    """Evaluate a distributed DLRM model and compute test AUC.

    Args:
        model (DistDLRM): Model to evaluate
        data_loader (torch.utils.data.DataLoader): Test data loader

    Returns:
        float or None: AUC on the main process, None on other ranks
    """
    model.eval()

    device = FLAGS.base_device
    world_size = dist.get_world_size()

    batch_sizes_per_gpu = [
        FLAGS.test_batch_size // world_size for _ in range(world_size)
    ]
    test_batch_size = sum(batch_sizes_per_gpu)

    if FLAGS.test_batch_size != test_batch_size:
        print(f"Rounded test_batch_size to {test_batch_size}")
    print(f"Batch sizes per GPU {batch_sizes_per_gpu}")

    # The test batch size can be large; make sure progress is still printed.
    default_print_freq = max(524288 * 100 // test_batch_size, 1)
    print_freq = default_print_freq if FLAGS.print_freq is None else FLAGS.print_freq

    steps_per_epoch = len(data_loader)
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'step_time', utils.SmoothedValue(window_size=1, fmt='{avg:.4f}'))

    with torch.no_grad():
        timer = utils.StepTimer()

        # ROC points could be accumulated per batch and AUC computed globally,
        # but that code is not available here, so all outputs and labels are
        # collected and AUC is computed once at the end. The y_true / y_score
        # naming follows sklearn.
        y_true = []
        y_score = []
        data_stream = torch.cuda.Stream()

        batch_iter = prefetcher(iter(data_loader), data_stream)

        timer.click()

        for step in range(len(data_loader)):
            numerical_features, categorical_features, click = next(batch_iter)
            torch.cuda.synchronize()

            last_batch_size = None
            if click.shape[0] != test_batch_size:  # last batch
                last_batch_size = click.shape[0]
                logging.warning("Pad the last test batch of size %d to %d",
                                last_batch_size, test_batch_size)
                padding_size = test_batch_size - last_batch_size

                if numerical_features is not None:
                    # Padding values are arbitrary: the padded rows are
                    # discarded from the gathered output before computing AUC.
                    padding_numerical = torch.empty(
                        padding_size,
                        numerical_features.shape[1],
                        device=numerical_features.device,
                        dtype=numerical_features.dtype)
                    numerical_features = torch.cat(
                        (numerical_features, padding_numerical), dim=0)

                if categorical_features is not None:
                    # torch.ones (rather than torch.empty) keeps the padded
                    # rows valid as embedding indices.
                    padding_categorical = torch.ones(
                        padding_size,
                        categorical_features.shape[1],
                        device=categorical_features.device,
                        dtype=categorical_features.dtype)
                    categorical_features = torch.cat(
                        (categorical_features, padding_categorical), dim=0)

            output = model(numerical_features, categorical_features,
                           batch_sizes_per_gpu).squeeze()

            # split() yields views into the buffer, so all_gather writes each
            # rank's output shard directly into output_receive_buffer.
            output_receive_buffer = torch.empty(test_batch_size, device=device)
            torch.distributed.all_gather(
                list(output_receive_buffer.split(batch_sizes_per_gpu)), output)
            if last_batch_size is not None:
                output_receive_buffer = output_receive_buffer[:last_batch_size]

            if FLAGS.auc_device == "CPU":
                click = click.cpu()
                output_receive_buffer = output_receive_buffer.cpu()

            y_true.append(click)
            y_score.append(output_receive_buffer)

            timer.click()

            if timer.measured is not None:
                metric_logger.update(step_time=timer.measured)
                if step % print_freq == 0 and step > 0:
                    metric_logger.print(
                        header=f"Test: [{step}/{steps_per_epoch}]")

        if is_main_process():
            auc = utils.roc_auc_score(
                torch.cat(y_true), torch.sigmoid(torch.cat(y_score).float()))
        else:
            auc = None

        torch.distributed.barrier()

    model.train()

    return auc
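
The gather pattern above avoids an extra concatenation: splitting a contiguous buffer yields views, so all_gather fills the buffer in place. A toy sketch of the same idea, assuming torch.distributed has already been initialized with one GPU per rank (all names here are illustrative):

import torch
import torch.distributed as dist

# Each rank contributes a shard of size 4; the buffer holds them all.
world_size = dist.get_world_size()
per_rank_sizes = [4] * world_size
buffer = torch.empty(sum(per_rank_sizes), device='cuda')
local_out = torch.full((4,), float(dist.get_rank()), device='cuda')

# split() returns views into `buffer`, so no copy-back is needed.
dist.all_gather(list(buffer.split(per_rank_sizes)), local_out)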
Example #4
def inference_benchmark_graphed(model, data_loader, num_batches=100):
    """Measure per-batch inference latency with CUDA graph replay and report AUC."""
    model.eval()
    base_device = FLAGS.base_device
    latencies = []

    data_iter = iter(data_loader)
    numerical, categorical, _ = next(data_iter)

    # Warm up on a side stream so capture starts from a clean CUDA state
    s = torch.cuda.Stream()
    static_numerical = numerical.to(base_device)
    static_categorical = categorical.to(device=base_device, dtype=torch.int64)
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(10):
            if FLAGS.amp:
                numerical = static_numerical.half()
            else:
                numerical = static_numerical
            inference_result = model(numerical, static_categorical).squeeze()

    torch.cuda.synchronize()

    # Graph capture: inference_result becomes the static output buffer
    # that graph.replay() refreshes in place.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        if FLAGS.amp:
            numerical = static_numerical.half()
        else:
            numerical = static_numerical
        inference_result = model(numerical, static_categorical).squeeze()

    torch.cuda.synchronize()
    # Timed inference: copy new inputs into the static buffers and replay.
    y_true = []
    y_score = []

    with torch.no_grad():
        for step, (numerical_features, categorical_features, click) in enumerate(data_loader):
            if step >= num_batches:
                break
            torch.cuda.synchronize()
            step_start_time = time()

            numerical_features = numerical_features.to(base_device)
            categorical_features = categorical_features.to(device=base_device, dtype=torch.int64)

            static_categorical.copy_(categorical_features)
            static_numerical.copy_(numerical_features)
            graph.replay()
            torch.cuda.synchronize()
            step_time = time() - step_start_time

            if step >= FLAGS.benchmark_warmup_steps:
                latencies.append(step_time)
            y_true.append(click)
            y_score.append(inference_result.reshape([-1]).clone())
    y_true = torch.cat(y_true)
    y_score = torch.sigmoid(torch.cat(y_score)).float()
    auc = utils.roc_auc_score(y_true, y_score)
    print('auc: ', auc)
    return latencies
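
The capture/replay structure above follows the standard torch.cuda.CUDAGraph recipe: warm up on a side stream, capture once with static input tensors, then refresh those tensors in place and replay. A stripped-down sketch of the same pattern on a toy model (all names here are illustrative):

import torch

model = torch.nn.Linear(16, 1).cuda().eval()
static_in = torch.randn(8, 16, device='cuda')

with torch.no_grad():
    # Warm up on a side stream so capture starts from a clean CUDA state.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(3):
            model(static_in)
    torch.cuda.current_stream().wait_stream(s)

    # Capture once; static_out is the buffer replay() refreshes in place.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_out = model(static_in)

    # Replay with new data: copy into the static input, then read the output.
    for _ in range(5):
        static_in.copy_(torch.randn(8, 16, device='cuda'))
        graph.replay()
        result = static_out.clone()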