Example #1
from typing import Optional, Sequence

import torch
import torch.distributed as dist

# DlrmBottom, DlrmTop, and create_interaction come from the surrounding DLRM
# codebase and are assumed to be in scope. In the repo, `dist` may be a local
# wrapper around torch.distributed; only get_world_size() is needed here.


# The class statement was missing from this snippet; the name follows the
# "model (DistDLRM)" annotations in the docstrings below.
class DistDLRM:
    def __call__(self, numerical_input, categorical_inputs):
        """Single GPU forward"""
        assert dist.get_world_size() == 1  # DO NOT run this in distributed mode
        bottom_out = self.bottom_model(numerical_input, categorical_inputs)
        top_out = self.top_model(bottom_out)

        return top_out
    def __init__(self,
                 num_numerical_features: int,
                 categorical_feature_sizes: Sequence[int],
                 bottom_mlp_sizes: Sequence[int],
                 top_mlp_sizes: Sequence[int],
                 vectors_per_gpu: Optional[Sequence[int]] = None,
                 embedding_device_mapping: Optional[Sequence[Sequence[int]]] = None,
                 world_num_categorical_features: Optional[int] = None,
                 embedding_type: str = "multi_table",
                 embedding_dim: int = 128,
                 interaction_op: str = "dot",
                 hash_indices: bool = False,
                 use_cpp_mlp: bool = False,
                 fp16: bool = False,
                 bottom_features_ordered: bool = False,
                 device: str = "cuda"):
        super().__init__()

        self.distributed = dist.get_world_size() > 1

        self._vectors_per_gpu = vectors_per_gpu
        self._embedding_dim = embedding_dim
        self._interaction_op = interaction_op
        self._hash_indices = hash_indices

        if self.distributed:
            # TODO: take bottom_mlp GPU from device mapping, do not assume it's always first
            if bottom_features_ordered:
                self._device_feature_order = torch.tensor(
                    [-1] + [i for bucket in embedding_device_mapping for i in bucket],
                    dtype=torch.long,
                    device=device) + 1
                self._feature_order = self._device_feature_order.argsort()
            else:
                self._device_feature_order = None
                self._feature_order = None
        else:
            world_num_categorical_features = len(categorical_feature_sizes)

        interaction = create_interaction(interaction_op,
                                         world_num_categorical_features,
                                         embedding_dim)

        self.bottom_model = DlrmBottom(num_numerical_features,
                                       categorical_feature_sizes,
                                       bottom_mlp_sizes,
                                       embedding_type,
                                       embedding_dim,
                                       hash_indices=hash_indices,
                                       use_cpp_mlp=use_cpp_mlp,
                                       fp16=fp16,
                                       device=device)
        self.top_model = DlrmTop(top_mlp_sizes,
                                 interaction,
                                 use_cpp_mlp=use_cpp_mlp).to(device)
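
A minimal single-process usage sketch for the class above. The feature sizes, MLP widths, and the one-rank "gloo" process group are illustrative assumptions, not values from the source, and DlrmBottom/DlrmTop must be importable for the constructor to work.

import torch
import torch.distributed as dist

# One-rank process group so that dist.get_world_size() == 1 holds.
dist.init_process_group("gloo", init_method="tcp://127.0.0.1:23456",
                        rank=0, world_size=1)

model = DistDLRM(num_numerical_features=13,
                 categorical_feature_sizes=[1000, 1000, 1000],
                 bottom_mlp_sizes=[512, 256, 128],
                 top_mlp_sizes=[1024, 512, 256, 1],
                 device="cuda")

numerical = torch.rand(64, 13, device="cuda")
categorical = torch.randint(0, 1000, (64, 3), device="cuda")
scores = model(numerical, categorical)  # the single-GPU __call__ above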
Example #3

# Snippet from the repo's evaluation script: FLAGS, utils, prefetcher,
# is_main_process, np, torch, dist, and logging are module-level names
# not shown here.
def _dist_permutation(size):
    """Generate permutation for dataset shuffle

    Args:
        size (int): Length of the permutation; values range over [0, size)

    Returns:
        permutation (ndarray): A permutation of arange(size), shared by all ranks
    """
    if dist.get_world_size() > 1:
        # To guarantee all ranks see the same permutation, generate it on rank 0
        # and share it with the other ranks through the filesystem. Note that
        # /tmp is node-local, so this assumes a single node or a shared filesystem.
        permutation_file = "/tmp/permutation.npy"
        if dist.get_local_rank() == 0:
            np.save(permutation_file, np.random.permutation(size))
        torch.distributed.barrier()
        permutation = np.load(permutation_file)
    else:
        permutation = np.random.permutation(size)

    return permutation
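
A short usage sketch for the helper above; the array is an illustrative stand-in for per-sample indices, and a process group (or the repo's dist wrapper) must already be initialized in the multi-rank case.

import numpy as np

samples = np.arange(10)                  # placeholder dataset indices
perm = _dist_permutation(len(samples))   # identical on every rank
shuffled = samples[perm]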
def dist_evaluate(model, data_loader):
    """Test distributed DLRM model

    Args:
        model (DistDLRM):
        data_loader (torch.utils.data.DataLoader):
    """
    model.eval()

    device = FLAGS.base_device
    world_size = dist.get_world_size()

    batch_sizes_per_gpu = [
        FLAGS.test_batch_size // world_size for _ in range(world_size)
    ]
    test_batch_size = sum(batch_sizes_per_gpu)

    if FLAGS.test_batch_size != test_batch_size:
        print(f"Rounded test_batch_size to {test_batch_size}")
    print(f"Batch sizes per GPU {batch_sizes_per_gpu}")

    # Test batch size can be large; scale the default print frequency so
    # progress still prints at a reasonable interval
    default_print_freq = max(524288 * 100 // test_batch_size, 1)
    print_freq = default_print_freq if FLAGS.print_freq is None else FLAGS.print_freq

    steps_per_epoch = len(data_loader)
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'step_time', utils.SmoothedValue(window_size=1, fmt='{avg:.4f}'))

    with torch.no_grad():
        timer = utils.StepTimer()

        # ROC statistics could be accumulated per batch and AUC computed globally
        # at the end, but that code is not available here, so collect all outputs
        # and labels and compute AUC in one pass. y_true/y_score naming follows sklearn.
        y_true = []
        y_score = []
        data_stream = torch.cuda.Stream()

        batch_iter = prefetcher(iter(data_loader), data_stream)

        timer.click()

        for step in range(len(data_loader)):
            numerical_features, categorical_features, click = next(batch_iter)
            torch.cuda.synchronize()

            last_batch_size = None
            if click.shape[0] != test_batch_size:  # last batch
                last_batch_size = click.shape[0]
                logging.warning("Pad the last test batch of size %d to %d",
                                last_batch_size, test_batch_size)
                padding_size = test_batch_size - last_batch_size

                if numerical_features is not None:
                    # Uninitialized padding is fine here: the padded rows are
                    # dropped from the gathered output below.
                    padding_numerical = torch.empty(
                        padding_size,
                        numerical_features.shape[1],
                        device=numerical_features.device,
                        dtype=numerical_features.dtype)
                    numerical_features = torch.cat(
                        (numerical_features, padding_numerical), dim=0)

                if categorical_features is not None:
                    # torch.ones (not empty) keeps the padded indices valid
                    # embedding rows
                    padding_categorical = torch.ones(
                        padding_size,
                        categorical_features.shape[1],
                        device=categorical_features.device,
                        dtype=categorical_features.dtype)
                    categorical_features = torch.cat(
                        (categorical_features, padding_categorical), dim=0)

            output = model(numerical_features, categorical_features,
                           batch_sizes_per_gpu).squeeze()

            output_receive_buffer = torch.empty(test_batch_size, device=device)
            torch.distributed.all_gather(
                list(output_receive_buffer.split(batch_sizes_per_gpu)), output)
            if last_batch_size is not None:
                output_receive_buffer = output_receive_buffer[:last_batch_size]

            if FLAGS.auc_device == "CPU":
                click = click.cpu()
                output_receive_buffer = output_receive_buffer.cpu()

            y_true.append(click)
            y_score.append(output_receive_buffer)

            timer.click()

            if timer.measured is not None:
                metric_logger.update(step_time=timer.measured)
                if step % print_freq == 0 and step > 0:
                    metric_logger.print(
                        header=f"Test: [{step}/{steps_per_epoch}]")

        if is_main_process():
            auc = utils.roc_auc_score(
                torch.cat(y_true), torch.sigmoid(torch.cat(y_score).float()))
        else:
            auc = None

        torch.distributed.barrier()

    model.train()

    return auc
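
The collective pattern in the loop above is worth isolating: torch.distributed.all_gather needs same-shaped tensors on every rank, so the short last batch is padded to the fixed size and the gathered result is truncated afterwards. A hypothetical standalone helper capturing that pattern (not part of the source):

import torch
import torch.distributed as dist

def gather_and_trim(local_out, batch_sizes_per_gpu, valid_size=None):
    # local_out is this rank's (already padded) slice of the test batch
    buf = torch.empty(sum(batch_sizes_per_gpu),
                      device=local_out.device, dtype=local_out.dtype)
    dist.all_gather(list(buf.split(batch_sizes_per_gpu)), local_out)
    # Drop the padded rows when the last batch was short
    return buf if valid_size is None else buf[:valid_size]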
Example #5

# Snippet from the repo's evaluation script: FLAGS, utils, dataset, dist_model,
# metrics, logging, and time are module-level names not shown here.
def dist_evaluate(model, data_loader, data_cache):
    """Test distributed DLRM model

    Args:
        model (DistDLRM):
        data_loader (torch.utils.data.DataLoader):
    """
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    device_mapping = dist_model.get_criteo_device_mapping(world_size)
    vectors_per_gpu = device_mapping['vectors_per_gpu']

    # Test batch size can be large; scale the default print frequency so
    # progress still prints at a reasonable interval
    default_print_freq = max(16384 * 2000 // FLAGS.test_batch_size, 1)
    print_freq = default_print_freq if FLAGS.print_freq is None else FLAGS.print_freq

    steps_per_epoch = len(data_loader)
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'step_time', utils.SmoothedValue(window_size=1, fmt='{avg:.4f} ms'))
    local_embedding_device_mapping = torch.tensor(
        device_mapping['embedding'][rank],
        device=FLAGS.device,
        dtype=torch.long)
    with torch.no_grad():
        # ROC statistics could be accumulated per batch and AUC computed globally
        # at the end, but that code is not available here, so collect all outputs
        # and labels and compute AUC in one pass. y_true/y_score naming follows sklearn.
        y_true = []
        y_score = []
        data_stream = torch.cuda.Stream()
        stop_time = time()

        if not data_cache:  # covers both None and an empty cache
            eval_data_iter = dataset.prefetcher(iter(data_loader), data_stream)
        else:
            print("Use cached eval data")
            eval_data_iter = data_cache
        for step, (numerical_features, categorical_features,
                   click) in enumerate(eval_data_iter):
            if data_cache is not None and len(data_cache) < steps_per_epoch:
                data_cache.append(
                    (numerical_features, categorical_features, click))
            last_batch_size = None
            if click.shape[0] != FLAGS.test_batch_size:  # last batch
                last_batch_size = click.shape[0]
                logging.debug("Pad the last test batch of size %d to %d",
                              last_batch_size, FLAGS.test_batch_size)
                padding_size = FLAGS.test_batch_size - last_batch_size
                padding_numerical = torch.empty(
                    padding_size,
                    numerical_features.shape[1],
                    device=numerical_features.device,
                    dtype=numerical_features.dtype)
                numerical_features = torch.cat(
                    (numerical_features, padding_numerical), dim=0)
                if categorical_features is not None:
                    padding_categorical = torch.ones(
                        padding_size,
                        categorical_features.shape[1],
                        device=categorical_features.device,
                        dtype=categorical_features.dtype)
                    categorical_features = torch.cat(
                        (categorical_features, padding_categorical), dim=0)

            if FLAGS.dataset_type != "dist":
                categorical_features = categorical_features[
                    :, local_embedding_device_mapping]

            if FLAGS.fp16 and categorical_features is not None:
                numerical_features = numerical_features.to(torch.float16)
            bottom_out = model.bottom_model(numerical_features,
                                            categorical_features)
            batch_size_per_gpu = FLAGS.test_batch_size // world_size
            from_bottom = dist_model.bottom_to_top(bottom_out,
                                                   batch_size_per_gpu,
                                                   model.embedding_dim,
                                                   vectors_per_gpu)

            output = model.top_model(from_bottom).squeeze()

            buffer_dtype = torch.float16 if FLAGS.fp16 else torch.float32
            output_receive_buffer = torch.empty(FLAGS.test_batch_size,
                                                device=FLAGS.device,
                                                dtype=buffer_dtype)
            torch.distributed.all_gather(
                list(output_receive_buffer.split(batch_size_per_gpu)), output)
            if last_batch_size is not None:
                output_receive_buffer = output_receive_buffer[:last_batch_size]

            y_true.append(click)
            y_score.append(output_receive_buffer.float())

            if step % print_freq == 0 and step != 0:
                torch.cuda.synchronize()
                metric_logger.update(step_time=(time() - stop_time) * 1000 /
                                     print_freq)
                stop_time = time()
                metric_logger.print(header=f"Test: [{step}/{steps_per_epoch}]")

        auc = metrics.roc_auc_score(torch.cat(y_true),
                                    torch.sigmoid(torch.cat(y_score).float()))

    return auc
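
In the loop above, model.bottom_model runs model-parallel (every rank processes the full batch for its local embedding tables) while model.top_model runs data-parallel (every rank processes only its batch slice); dist_model.bottom_to_top bridges the two. A hypothetical sketch of that exchange using torch.distributed.all_to_all, with assumed names and shapes rather than the repo's implementation:

import torch
import torch.distributed as dist

def bottom_to_top_sketch(bottom_out, batch_size_per_gpu, embedding_dim,
                         vectors_per_gpu):
    # bottom_out: [full_batch, local_vectors, dim] produced on this rank
    # Send rank i its slice of the batch, carrying our local vectors
    send = [t.contiguous()
            for t in bottom_out.split(batch_size_per_gpu, dim=0)]
    # Receive this rank's batch slice from every peer, sized by how many
    # vectors each peer contributes
    recv = [torch.empty(batch_size_per_gpu, n, embedding_dim,
                        device=bottom_out.device, dtype=bottom_out.dtype)
            for n in vectors_per_gpu]
    dist.all_to_all(recv, send)
    # [batch_size_per_gpu, total_vectors, dim] ready for the top model
    return torch.cat(recv, dim=1)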