def __call__(self, numerical_input, categorical_inputs):
    """Single GPU forward"""
    assert dist.get_world_size() == 1  # do NOT run this in distributed mode

    bottom_out = self.bottom_model(numerical_input, categorical_inputs)
    top_out = self.top_model(bottom_out)

    return top_out
def __init__(self, num_numerical_features: int, categorical_feature_sizes: Sequence[int],
             bottom_mlp_sizes: Sequence[int], top_mlp_sizes: Sequence[int],
             vectors_per_gpu: Sequence[int] = None,
             embedding_device_mapping: Sequence[Sequence[int]] = None,
             world_num_categorical_features: int = None,
             embedding_type: str = "multi_table", embedding_dim: int = 128,
             interaction_op: str = "dot", hash_indices: bool = False,
             use_cpp_mlp: bool = False, fp16: bool = False,
             bottom_features_ordered: bool = False, device: str = "cuda"):
    super().__init__()

    self.distributed = dist.get_world_size() > 1

    self._vectors_per_gpu = vectors_per_gpu
    self._embedding_dim = embedding_dim
    self._interaction_op = interaction_op
    self._hash_indices = hash_indices

    if self.distributed:
        # TODO: take bottom_mlp GPU from device mapping, do not assume it's always first
        self._device_feature_order = torch.tensor(
            [-1] + [i for bucket in embedding_device_mapping for i in bucket],
            dtype=torch.long, device=device
        ) + 1 if bottom_features_ordered else None
        self._feature_order = self._device_feature_order.argsort() if bottom_features_ordered else None
    else:
        world_num_categorical_features = len(categorical_feature_sizes)

    interaction = create_interaction(interaction_op, world_num_categorical_features, embedding_dim)

    self.bottom_model = DlrmBottom(
        num_numerical_features, categorical_feature_sizes, bottom_mlp_sizes,
        embedding_type, embedding_dim, hash_indices=hash_indices,
        use_cpp_mlp=use_cpp_mlp, fp16=fp16, device=device)
    self.top_model = DlrmTop(top_mlp_sizes, interaction, use_cpp_mlp=use_cpp_mlp).to(device)
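# A minimal single-GPU usage sketch for the model defined above. The enclosing class name
# ("Dlrm" below), the feature counts, table cardinalities, and MLP widths are illustrative
# assumptions, not values taken from this repo; it also assumes dist.get_world_size()
# returns 1 when torch.distributed is not initialized, so the __call__ assert passes.
#
# import torch
#
# model = Dlrm(
#     num_numerical_features=13,
#     categorical_feature_sizes=[100, 200, 300],   # hypothetical cardinalities
#     bottom_mlp_sizes=[512, 256, 128],
#     top_mlp_sizes=[1024, 512, 256, 1],
#     embedding_dim=128,
#     device="cuda")
#
# numerical = torch.rand(2048, 13, device="cuda")
# categorical = torch.randint(0, 100, (2048, 3), device="cuda")
# scores = model(numerical, categorical)           # single-GPU forward via __call__ above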
def _dist_permutation(size):
    """Generate permutation for dataset shuffle

    Args:
        size (int): Size and high value of permutation

    Returns:
        permutation (ndarray): permutation of [0, size)
    """
    if dist.get_world_size() > 1:
        # To guarantee all ranks use the same permutation, generate it on rank 0 and sync
        # it to the other ranks by writing it to disk
        permutation_file = "/tmp/permutation.npy"
        if dist.get_local_rank() == 0:
            np.save(permutation_file, np.random.permutation(size))
        torch.distributed.barrier()
        permutation = np.load(permutation_file)
    else:
        permutation = np.random.permutation(size)

    return permutation
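# A sketch of how the shared permutation could drive a consistent shuffle across ranks,
# assuming every rank calls _dist_permutation with the same size and that each rank then
# reads its own strided shard. The dataset size and sharding scheme are assumptions for
# illustration only.
#
# num_samples = 1_000_000
# permutation = _dist_permutation(num_samples)       # identical on every rank
# rank, world_size = dist.get_rank(), dist.get_world_size()
# local_indices = permutation[rank::world_size]      # this rank's samples, in shuffled order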
def dist_evaluate(model, data_loader):
    """Test distributed DLRM model

    Args:
        model (DistDLRM):
        data_loader (torch.utils.data.DataLoader):
    """
    model.eval()

    device = FLAGS.base_device
    world_size = dist.get_world_size()

    batch_sizes_per_gpu = [FLAGS.test_batch_size // world_size for _ in range(world_size)]
    test_batch_size = sum(batch_sizes_per_gpu)
    if FLAGS.test_batch_size != test_batch_size:
        print(f"Rounded test_batch_size to {test_batch_size}")
    print(f"Batch sizes per GPU {batch_sizes_per_gpu}")

    # Test batch size could be big, make sure it prints
    default_print_freq = max(524288 * 100 // test_batch_size, 1)
    print_freq = default_print_freq if FLAGS.print_freq is None else FLAGS.print_freq

    steps_per_epoch = len(data_loader)
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('step_time', utils.SmoothedValue(window_size=1, fmt='{avg:.4f}'))

    with torch.no_grad():
        timer = utils.StepTimer()

        # ROC could be computed per batch and the AUC aggregated globally, but that code is not
        # available here, so pack all outputs and labels together and compute AUC at the end.
        # y_true and y_score naming follows sklearn.
        y_true = []
        y_score = []
        data_stream = torch.cuda.Stream()

        batch_iter = prefetcher(iter(data_loader), data_stream)

        timer.click()
        for step in range(len(data_loader)):
            numerical_features, categorical_features, click = next(batch_iter)
            torch.cuda.synchronize()

            last_batch_size = None
            if click.shape[0] != test_batch_size:  # last batch
                last_batch_size = click.shape[0]
                logging.warning("Pad the last test batch of size %d to %d",
                                last_batch_size, test_batch_size)
                padding_size = test_batch_size - last_batch_size

                if numerical_features is not None:
                    padding_numerical = torch.empty(
                        padding_size, numerical_features.shape[1],
                        device=numerical_features.device,
                        dtype=numerical_features.dtype)
                    numerical_features = torch.cat((numerical_features, padding_numerical), dim=0)

                if categorical_features is not None:
                    padding_categorical = torch.ones(
                        padding_size, categorical_features.shape[1],
                        device=categorical_features.device,
                        dtype=categorical_features.dtype)
                    categorical_features = torch.cat((categorical_features, padding_categorical), dim=0)

            output = model(numerical_features, categorical_features, batch_sizes_per_gpu).squeeze()

            output_receive_buffer = torch.empty(test_batch_size, device=device)
            torch.distributed.all_gather(
                list(output_receive_buffer.split(batch_sizes_per_gpu)), output)
            if last_batch_size is not None:
                output_receive_buffer = output_receive_buffer[:last_batch_size]

            if FLAGS.auc_device == "CPU":
                click = click.cpu()
                output_receive_buffer = output_receive_buffer.cpu()

            y_true.append(click)
            y_score.append(output_receive_buffer)

            timer.click()

            if timer.measured is not None:
                metric_logger.update(step_time=timer.measured)
                if step % print_freq == 0 and step > 0:
                    metric_logger.print(header=f"Test: [{step}/{steps_per_epoch}]")

    if is_main_process():
        auc = utils.roc_auc_score(torch.cat(y_true), torch.sigmoid(torch.cat(y_score).float()))
    else:
        auc = None

    torch.distributed.barrier()

    model.train()

    return auc
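# A standalone sketch of the last-batch handling used above: pad the inputs up to the fixed
# test batch size before the forward/all_gather, then truncate the gathered scores back to the
# true batch size. Shapes and batch sizes are illustrative assumptions; the random scores
# stand in for the gathered model output.
#
# import torch
#
# test_batch_size, last_batch_size, num_numerical = 16, 10, 13
# numerical = torch.rand(last_batch_size, num_numerical)
#
# padding = torch.empty(test_batch_size - last_batch_size, num_numerical,
#                       device=numerical.device, dtype=numerical.dtype)
# padded = torch.cat((numerical, padding), dim=0)   # shape: (test_batch_size, num_numerical)
#
# scores = torch.rand(test_batch_size)              # stand-in for the all_gather'ed output
# scores = scores[:last_batch_size]                 # drop the rows produced by padding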
def dist_evaluate(model, data_loader, data_cache):
    """Test distributed DLRM model

    Args:
        model (DistDLRM):
        data_loader (torch.utils.data.DataLoader):
        data_cache (list or None): cache of eval batches, filled during the first pass
    """
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    device_mapping = dist_model.get_criteo_device_mapping(world_size)
    vectors_per_gpu = device_mapping['vectors_per_gpu']

    # Test batch size could be big, make sure it prints
    default_print_freq = max(16384 * 2000 // FLAGS.test_batch_size, 1)
    print_freq = default_print_freq if FLAGS.print_freq is None else FLAGS.print_freq

    steps_per_epoch = len(data_loader)
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('step_time', utils.SmoothedValue(window_size=1, fmt='{avg:.4f} ms'))

    local_embedding_device_mapping = torch.tensor(
        device_mapping['embedding'][rank], device=FLAGS.device, dtype=torch.long)

    with torch.no_grad():
        # ROC could be computed per batch and the AUC aggregated globally, but that code is not
        # available here, so pack all outputs and labels together and compute AUC at the end.
        # y_true and y_score naming follows sklearn.
        y_true = []
        y_score = []
        data_stream = torch.cuda.Stream()
        stop_time = time()

        if data_cache is None or not data_cache:
            eval_data_iter = dataset.prefetcher(iter(data_loader), data_stream)
        else:
            print("Use cached eval data")
            eval_data_iter = data_cache

        for step, (numerical_features, categorical_features, click) in enumerate(eval_data_iter):
            if data_cache is not None and len(data_cache) < steps_per_epoch:
                data_cache.append((numerical_features, categorical_features, click))

            last_batch_size = None
            if click.shape[0] != FLAGS.test_batch_size:  # last batch
                last_batch_size = click.shape[0]
                logging.debug("Pad the last test batch of size %d to %d",
                              last_batch_size, FLAGS.test_batch_size)
                padding_size = FLAGS.test_batch_size - last_batch_size

                padding_numerical = torch.empty(
                    padding_size, numerical_features.shape[1],
                    device=numerical_features.device,
                    dtype=numerical_features.dtype)
                numerical_features = torch.cat((numerical_features, padding_numerical), dim=0)

                if categorical_features is not None:
                    padding_categorical = torch.ones(
                        padding_size, categorical_features.shape[1],
                        device=categorical_features.device,
                        dtype=categorical_features.dtype)
                    categorical_features = torch.cat((categorical_features, padding_categorical), dim=0)

            if FLAGS.dataset_type != "dist":
                categorical_features = categorical_features[:, local_embedding_device_mapping]

            if FLAGS.fp16 and categorical_features is not None:
                numerical_features = numerical_features.to(torch.float16)

            bottom_out = model.bottom_model(numerical_features, categorical_features)
            batch_size_per_gpu = FLAGS.test_batch_size // world_size
            from_bottom = dist_model.bottom_to_top(
                bottom_out, batch_size_per_gpu, model.embedding_dim, vectors_per_gpu)

            output = model.top_model(from_bottom).squeeze()

            buffer_dtype = torch.float32 if not FLAGS.fp16 else torch.float16
            output_receive_buffer = torch.empty(
                FLAGS.test_batch_size, device=FLAGS.device, dtype=buffer_dtype)
            torch.distributed.all_gather(
                list(output_receive_buffer.split(batch_size_per_gpu)), output)
            if last_batch_size is not None:
                output_receive_buffer = output_receive_buffer[:last_batch_size]

            y_true.append(click)
            y_score.append(output_receive_buffer.float())

            if step % print_freq == 0 and step != 0:
                torch.cuda.synchronize()
                metric_logger.update(step_time=(time() - stop_time) * 1000 / print_freq)
                stop_time = time()
                metric_logger.print(header=f"Test: [{step}/{steps_per_epoch}]")

    auc = metrics.roc_auc_score(torch.cat(y_true), torch.sigmoid(torch.cat(y_score).float()))

    return auc
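# A sketch of the final AUC computation, assuming utils.roc_auc_score / metrics.roc_auc_score
# mirror sklearn's roc_auc_score (sklearn is used directly here so the snippet runs on its own).
# The labels and logits are synthetic stand-ins for the accumulated y_true / y_score lists.
#
# import torch
# from sklearn.metrics import roc_auc_score
#
# y_true = [torch.randint(0, 2, (1024,)) for _ in range(4)]   # per-step click labels
# y_score = [torch.randn(1024) for _ in range(4)]             # per-step raw model outputs (logits)
#
# auc = roc_auc_score(torch.cat(y_true).cpu().numpy(),
#                     torch.sigmoid(torch.cat(y_score).float()).cpu().numpy())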