def _prune_embs(
    self,
    idx: int,
    num_rows: int,
    module: split_table_batched_embeddings_ops.SplitTableBatchedEmbeddingBagsCodegen,
) -> Tuple[Tensor, Optional[Tensor]]:
    # TODO(yingz): Avoid DtoH / HtoD overhead.
    weights = module.split_embedding_weights()[idx].cpu()

    if self.pruning_ratio is None:
        return (weights, None)
    new_num_rows = int(math.ceil(num_rows * (1.0 - self.pruning_ratio)))  # type: ignore
    if new_num_rows == num_rows:
        return (weights, None)

    (indicators, threshold) = self._prune_by_weights_l2_norm(new_num_rows, weights)

    return torch.ops.fb.embedding_bag_rowwise_prune(
        weights, indicators, threshold, torch.int32
    )
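# The helper `_prune_by_weights_l2_norm` used above is not shown in this snippet. Below
# is a minimal, hypothetical sketch of what such a helper might compute, assuming that
# `indicators` is a per-row importance score (the row-wise L2 norm) and `threshold` is
# the smallest norm among the `new_num_rows` rows that survive pruning; the actual
# implementation may differ.
import torch
from torch import Tensor
from typing import Tuple


def _prune_by_weights_l2_norm_sketch(new_num_rows: int, weights: Tensor) -> Tuple[Tensor, float]:
    # Score each embedding row by its L2 norm.
    indicators = weights.norm(p=2, dim=1)
    # Keep the `new_num_rows` rows with the largest norms; the k-th largest norm
    # becomes the pruning threshold passed to the row-wise prune op.
    threshold = torch.topk(indicators, new_num_rows, largest=True).values[-1].item()
    return indicators, threshold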
def cache(  # noqa C901
    alpha: float,
    bag_size: int,
    batch_size: int,
    cache_algorithm: str,
    cache_sets: int,
    embedding_dim: int,
    weights_precision: SparseType,
    stoc: bool,
    iters: int,
    long_index: bool,
    mixed: bool,
    num_embeddings: int,
    num_tables: int,
    reuse: float,
    weighted: bool,
    flush_gpu_cache_size_mb: int,
) -> None:
    np.random.seed(42)
    optimizer = OptimType.EXACT_ROWWISE_ADAGRAD

    B = batch_size
    D = embedding_dim
    L = bag_size
    E = num_embeddings
    T = num_tables
    cache_alg = CacheAlgorithm.LRU if cache_algorithm == "lru" else CacheAlgorithm.LFU
    if mixed:
        Ds = [
            div_round_up(np.random.randint(low=int(0.5 * D), high=int(1.5 * D)), 4)
            for _ in range(T)
        ]
        D = np.average(Ds)
    else:
        Ds = [D] * T

    emb_nc = SplitTableBatchedEmbeddingBagsCodegen(
        [
            (
                E,
                d,
                EmbeddingLocation.MANAGED,
                ComputeDevice.CUDA,
            )
            for d in Ds
        ],
        optimizer=optimizer,
        weights_precision=weights_precision,
        stochastic_rounding=stoc,
    ).cuda()

    if weights_precision == SparseType.INT8:
        emb_nc.init_embedding_weights_uniform(-0.0003, 0.0003)

    emb = SplitTableBatchedEmbeddingBagsCodegen(
        [
            (
                E,
                d,
                EmbeddingLocation.MANAGED_CACHING,
                ComputeDevice.CUDA,
            )
            for d in Ds
        ],
        optimizer=optimizer,
        weights_precision=weights_precision,
        stochastic_rounding=stoc,
        cache_sets=cache_sets,
        cache_algorithm=cache_alg,
    ).cuda()

    if weights_precision == SparseType.INT8:
        emb.init_embedding_weights_uniform(-0.0003, 0.0003)

    nparams = sum(w.numel() for w in emb.split_embedding_weights())
    param_size_multiplier = PRECISION_SIZE_MULTIPLIER[weights_precision]
    logging.info(
        f"Embedding tables: {E * T} rows, {nparams / 1.0e9: .2f} GParam, "
        f"{nparams * param_size_multiplier / 1.0e6: .2f}MB"
    )
    logging.info(
        f"Accessed weights per batch: {B * T * L} rows, "
        f"{B * T * L * D * param_size_multiplier / 1.0e6: .2f}MB"
    )

    requests = generate_requests(
        2 * iters, B, T, L, E, reuse=reuse, alpha=alpha, weighted=weighted
    )
    warmup_requests, requests = requests[:iters], requests[iters:]
    grad_output = torch.randn(B, sum(Ds)).cuda()

    time_per_iter = benchmark_requests(
        requests,
        lambda indices, offsets, per_sample_weights: emb_nc(
            indices.long(), offsets.long(), per_sample_weights
        ).backward(grad_output),
        flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
    )
    logging.info(
        f"ForwardBackward (UVM), B: {B}, E: {E}, T: {T}, D: {D}, L: {L}, "
        f"BW: {3 * param_size_multiplier * B * sum(Ds) * L / time_per_iter / 1.0e9: .2f}GB/s, "
        f"T: {time_per_iter * 1.0e6:.0f}us"
    )

    # warm up
    for indices, offsets, _ in warmup_requests:
        emb.forward(indices.long(), offsets.long())

    # get cache miss rate (forward and backward) and exchanged cache lines (prefetch)
    cache_misses = []
    exchanged_cache_lines = []
    NOT_FOUND = -1
    for indices, offsets, _ in requests:
        # pyre-fixme[29]:
        #  `Union[BoundMethod[typing.Callable(Tensor.clone)[[Named(self,
        #  Variable[torch._TTensor (bound to Tensor)])], Variable[torch._TTensor (bound
        #  to Tensor)]], Tensor], Tensor, torch.nn.Module]` is not a function.
        old_lxu_cache_state = emb.lxu_cache_state.clone()
        emb.prefetch(indices.long(), offsets.long())
        exchanged_cache_lines.append(
            # pyre-fixme[16]: `bool` has no attribute `sum`.
            (emb.lxu_cache_state != old_lxu_cache_state).sum().item()
        )
        cache_misses.append(
            (emb.lxu_cache_locations_list[0] == NOT_FOUND).sum().item()
        )
        emb.forward(indices.long(), offsets.long())
    logging.info(
        f"Exchanged cache lines -- mean: {sum(exchanged_cache_lines)/len(requests): .2f}, "
        f"max: {max(exchanged_cache_lines)}, min: {min(exchanged_cache_lines)}"
    )
    logging.info(
        f"Cache miss -- mean: {sum(cache_misses)/len(requests)}, "
        f"max: {max(cache_misses)}, min: {min(cache_misses)}"
    )

    # benchmark prefetch
    emb.reset_cache_states()
    for indices, offsets, _ in warmup_requests:
        emb.forward(indices, offsets)
    prefetch_time, forward_backward_time = benchmark_pipelined_requests(
        requests,
        lambda indices, offsets, indices_weights: emb.prefetch(indices, offsets),
        lambda indices, offsets, indices_weights: emb.forward(
            indices, offsets, indices_weights
        ).backward(grad_output),
        flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
    )
    e2e_time = prefetch_time + forward_backward_time

    logging.info(
        f"ForwardBackward (LXU), reuse: {reuse}, alpha: {alpha}, B: {B}, "
        f"E: {E}, T: {T}, D: {D}, L: {L}, "
        f"BW: {3 * param_size_multiplier * B * sum(Ds) * L / e2e_time / 1.0e9: .2f}GB/s, "
        f"Tprefetch: {prefetch_time * 1.0e6:.0f}us, "
        f"{2 * sum(exchanged_cache_lines) * param_size_multiplier * D / prefetch_time / len(requests) / 1.0e9: .2f} GB/s, "
        f"Tfwdbwd: {forward_backward_time * 1.0e6:.0f}us, "
        f"{3 * param_size_multiplier * B * sum(Ds) * L / forward_backward_time / 1.0e9: .2f} GB/s, "
        f"Te2e: {e2e_time * 1.0e6:.0f}us, "
    )
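# `benchmark_pipelined_requests` above comes from the benchmark's helper module and is
# not shown here. Below is a minimal sketch of how such a pipelined timer could be
# structured, assuming it times the prefetch callable and the forward/backward callable
# separately with CUDA events, optionally flushes the GPU cache between iterations, and
# returns average seconds per stage; the actual FBGEMM helper may differ.
import torch
from typing import Callable, List, Optional, Tuple


def benchmark_pipelined_requests_sketch(
    requests: List[Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
    func1: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], None],
    func2: Callable[[torch.Tensor, torch.Tensor, Optional[torch.Tensor]], None],
    flush_gpu_cache_size_mb: int = 0,
) -> Tuple[float, float]:
    stage1_times, stage2_times = [], []
    cache_flush_buffer = (
        torch.empty(int(flush_gpu_cache_size_mb * 1024 * 1024 // 4), device="cuda")
        if flush_gpu_cache_size_mb
        else None
    )
    for indices, offsets, weights in requests:
        if cache_flush_buffer is not None:
            cache_flush_buffer.zero_()  # push resident cache lines out between iterations
        start1 = torch.cuda.Event(enable_timing=True)
        end1 = torch.cuda.Event(enable_timing=True)
        end2 = torch.cuda.Event(enable_timing=True)
        start1.record()
        func1(indices, offsets, weights)  # e.g. emb.prefetch(...)
        end1.record()
        func2(indices, offsets, weights)  # e.g. emb.forward(...).backward(...)
        end2.record()
        torch.cuda.synchronize()
        stage1_times.append(start1.elapsed_time(end1) * 1.0e-3)  # ms -> s
        stage2_times.append(end1.elapsed_time(end2) * 1.0e-3)
    return sum(stage1_times) / len(requests), sum(stage2_times) / len(requests)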
def uvm(
    alpha: float,
    bag_size: int,
    batch_size: int,
    embedding_dim: int,
    weights_precision: SparseType,
    stoc: bool,
    iters: int,
    mixed: bool,
    num_embeddings: int,
    num_tables: int,
    reuse: float,
    uvm_tables: int,
    uvm_bag_size: int,
    weighted: bool,
    flush_gpu_cache_size_mb: int,
) -> None:
    np.random.seed(42)
    B = batch_size
    D = embedding_dim
    L = bag_size
    E = num_embeddings
    T = num_tables
    T_uvm = uvm_tables
    assert T_uvm <= T
    assert (
        T_uvm > 0
    ), f"T_uvm specified {T_uvm} <= 0. If not testing UVM, please use device benchmark."
    T_gpu = T - T_uvm
    L_uvm = uvm_bag_size

    if mixed:
        Ds = [
            div_round_up(np.random.randint(low=int(0.5 * D), high=int(1.5 * D)), 4)
            for _ in range(T)
        ]
        D = np.average(Ds)
    else:
        Ds = [D] * T

    emb_uvm = SplitTableBatchedEmbeddingBagsCodegen(
        [
            (
                E,
                d,
                EmbeddingLocation.MANAGED,
                ComputeDevice.CUDA,
            )
            for d in Ds[:T_uvm]
        ],
        weights_precision=weights_precision,
        stochastic_rounding=stoc,
    ).cuda()

    if weights_precision == SparseType.INT8:
        emb_uvm.init_embedding_weights_uniform(-0.0003, 0.0003)

    if T_gpu > 0:
        emb_gpu = SplitTableBatchedEmbeddingBagsCodegen(
            [
                (
                    E,
                    d,
                    EmbeddingLocation.DEVICE,
                    ComputeDevice.CUDA,
                )
                for d in Ds[T_uvm:]
            ],
            weights_precision=weights_precision,
            stochastic_rounding=stoc,
        ).cuda()

        if weights_precision == SparseType.INT8:
            emb_gpu.init_embedding_weights_uniform(-0.0003, 0.0003)

        emb_mixed = SplitTableBatchedEmbeddingBagsCodegen(
            [
                (
                    E,
                    d,
                    managed_option,
                    ComputeDevice.CUDA,
                )
                for (d, managed_option) in zip(
                    Ds,
                    [EmbeddingLocation.MANAGED] * T_uvm
                    + [EmbeddingLocation.DEVICE] * T_gpu,
                )
            ],
            weights_precision=weights_precision,
            stochastic_rounding=stoc,
        ).cuda()

        if weights_precision == SparseType.INT8:
            emb_mixed.init_embedding_weights_uniform(-0.0003, 0.0003)

    requests_uvm = generate_requests(
        iters,
        B,
        T_uvm,
        L_uvm,
        E,
        reuse=reuse,
        alpha=alpha,
        weights_precision=weights_precision,
        weighted=weighted,
    )

    if T_gpu > 0:
        requests_gpu = generate_requests(
            iters,
            B,
            T_gpu,
            L,
            E,
            reuse=reuse,
            alpha=alpha,
            weights_precision=weights_precision,
            weighted=False,
        )

    param_size_multiplier = PRECISION_SIZE_MULTIPLIER[weights_precision]

    time_per_iter = benchmark_requests(
        requests_uvm,
        lambda indices, offsets, per_sample_weights: emb_uvm.forward(
            indices.long(),
            offsets.long(),
            per_sample_weights,
        ),
        flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
    )
    logging.info(
        f"UVM Forward, B: {B}, "
        f"E: {E}, T: {T_uvm}, D: {D}, L: {L_uvm}, W: {weighted}, "
        f"BW: {param_size_multiplier * B * sum(Ds[:T_uvm]) * L_uvm / time_per_iter / 1.0e9: .2f}GB/s, "  # noqa: B950
        f"T: {time_per_iter * 1.0e6:.0f}us"
    )

    if T_gpu > 0:
        requests = []
        for rs_uvm, rs_gpu in zip(requests_uvm, requests_gpu):
            indices = torch.cat([rs_uvm[0], rs_gpu[0]])
            lengths = [L_uvm] * (T_uvm * B) + [L] * (T_gpu * B)
            offsets = torch.tensor(([0] + np.cumsum(lengths).tolist())).int().cuda()
            per_sample_weights = None
            if weighted:
                assert (this_rs_uvm_weights := rs_uvm[2]) is not None
                assert (this_rs_gpu_weights := rs_gpu[2]) is not None
                per_sample_weights = torch.cat(
                    [this_rs_uvm_weights, this_rs_gpu_weights]
                )
            requests.append((indices, offsets, per_sample_weights))

        # forward
        time_per_iter = benchmark_requests(
            requests_gpu,
            lambda indices, offsets, per_sample_weights: emb_gpu.forward(
                indices.long(),
                offsets.long(),
                per_sample_weights,
            ),
            flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
        )
        logging.info(
            f"GPU Forward, B: {B}, "
            f"E: {E}, T: {T_gpu}, D: {D}, L: {L}, W: {weighted}, "
            f"BW: {param_size_multiplier * B * sum(Ds[T_uvm:]) * L / time_per_iter / 1.0e9: .2f}GB/s, "  # noqa: B950
            f"T: {time_per_iter * 1.0e6:.0f}us"
        )

        time_per_iter = benchmark_requests(
            requests,
            lambda indices, offsets, per_sample_weights: emb_mixed.forward(
                indices.long(),
                offsets.long(),
                per_sample_weights,
            ),
            flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
        )
        logging.info(
            f"Mixed Forward, B: {B}, "
            f"E: {E}, T: {T}, D: {D}, L: {L}, W: {weighted}, "
            f"BW: {param_size_multiplier * B * sum(Ds) * L / time_per_iter / 1.0e9: .2f}GB/s, "  # noqa: B950
            f"T: {time_per_iter * 1.0e6:.0f}us"
        )
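# `generate_requests` is imported from the benchmark helpers and not shown here. Below
# is a hypothetical sketch of one way to build the (indices, offsets, per_sample_weights)
# triples that the callers above unpack, assuming alpha > 1 selects a Zipf distribution
# over rows and uniform sampling otherwise; the real helper also handles `reuse` and
# `weights_precision`, which this sketch ignores.
import numpy as np
import torch
from typing import List, Optional, Tuple


def generate_requests_sketch(
    iters: int, B: int, T: int, L: int, E: int,
    alpha: float = 1.0, weighted: bool = False,
) -> List[Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]]:
    requests = []
    for _ in range(iters):
        if alpha <= 1.0:
            indices = np.random.randint(0, E, size=(T * B * L,))
        else:
            # Zipf-distributed row ids, clipped into [0, E).
            indices = np.minimum(np.random.zipf(alpha, size=(T * B * L,)) - 1, E - 1)
        indices = torch.from_numpy(indices.astype(np.int64)).cuda()
        # Fixed bag size L: offsets are a simple arange over the T * B bags.
        offsets = torch.arange(0, T * B * L + 1, L, dtype=torch.int64).cuda()
        weights = torch.randn(T * B * L, device="cuda") if weighted else None
        requests.append((indices, offsets, weights))
    return requests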
def device(  # noqa C901
    alpha: float,
    bag_size: int,
    batch_size: int,
    embedding_dim: int,
    weights_precision: SparseType,
    stoc: bool,
    iters: int,
    managed: str,
    mixed: bool,
    num_embeddings: int,
    num_tables: int,
    reuse: float,
    row_wise: bool,
    weighted: bool,
    weighted_num_requires_grad: Optional[int],
    flush_gpu_cache_size_mb: int,
) -> None:
    np.random.seed(42)
    B = batch_size
    D = embedding_dim
    L = bag_size
    E = num_embeddings
    T = num_tables
    if weighted_num_requires_grad:
        assert weighted_num_requires_grad <= T
        weighted_requires_grad_tables = np.random.choice(
            T, replace=False, size=(weighted_num_requires_grad,)
        ).tolist()
        feature_requires_grad = (
            torch.tensor(
                [1 if t in weighted_requires_grad_tables else 0 for t in range(T)]
            )
            .to(get_device())
            .int()
        )
    else:
        feature_requires_grad = None
    if mixed:
        Ds = [
            div_round_up(np.random.randint(low=int(0.5 * D), high=int(1.5 * D)), 4)
            for _ in range(T)
        ]
        D = np.average(Ds)
    else:
        Ds = [D] * T

    optimizer = OptimType.EXACT_ROWWISE_ADAGRAD if row_wise else OptimType.EXACT_ADAGRAD

    if managed == "device":
        managed_option = (
            EmbeddingLocation.DEVICE
            if torch.cuda.is_available()
            else EmbeddingLocation.HOST
        )
    else:
        managed_option = EmbeddingLocation.MANAGED

    emb = SplitTableBatchedEmbeddingBagsCodegen(
        [
            (
                E,
                d,
                managed_option,
                ComputeDevice.CUDA if torch.cuda.is_available() else ComputeDevice.CPU,
            )
            for d in Ds
        ],
        optimizer=optimizer,
        learning_rate=0.1,
        eps=0.1,
        weights_precision=weights_precision,
        stochastic_rounding=stoc,
    ).to(get_device())

    if weights_precision == SparseType.INT8:
        emb.init_embedding_weights_uniform(-0.0003, 0.0003)

    nparams = sum(w.numel() for w in emb.split_embedding_weights())
    param_size_multiplier = PRECISION_SIZE_MULTIPLIER[weights_precision]
    logging.info(
        f"Embedding parameters: {nparams / 1.0e9: .2f} GParam, "
        f"{nparams * param_size_multiplier / 1.0e9: .2f}GB"
    )
    logging.info(
        f"Accessed weights per batch: {B * sum(Ds) * L * param_size_multiplier / 1.0e6: .2f}MB"
    )

    requests = generate_requests(
        iters,
        B,
        T,
        L,
        E,
        reuse=reuse,
        alpha=alpha,
        weights_precision=weights_precision,
        weighted=weighted,
    )

    # forward
    time_per_iter = benchmark_requests(
        requests,
        lambda indices, offsets, per_sample_weights: emb.forward(
            indices.long(),
            offsets.long(),
            per_sample_weights,
            feature_requires_grad=feature_requires_grad,
        ),
        flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
    )
    logging.info(
        f"Forward, B: {B}, "
        f"E: {E}, T: {T}, D: {D}, L: {L}, W: {weighted}, "
        f"BW: {param_size_multiplier * B * sum(Ds) * L / time_per_iter / 1.0e9: .2f}GB/s, "  # noqa: B950
        f"T: {time_per_iter * 1.0e6:.0f}us"
    )

    grad_output = torch.randn(B, sum(Ds)).to(get_device())
    # backward
    time_per_iter = benchmark_requests(
        requests,
        lambda indices, offsets, per_sample_weights: emb(
            indices.long(),
            offsets.long(),
            per_sample_weights,
            feature_requires_grad=feature_requires_grad,
        ).backward(grad_output),
        flush_gpu_cache_size_mb=flush_gpu_cache_size_mb,
    )
    logging.info(
        f"ForwardBackward, B: {B}, E: {E}, T: {T}, D: {D}, L: {L}, "
        f"BW: {3 * param_size_multiplier * B * sum(Ds) * L / time_per_iter / 1.0e9: .2f}GB/s, "
        f"T: {time_per_iter * 1.0e6:.0f}us"
    )
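# A hypothetical invocation of the device benchmark above; the argument values are
# illustrative only and are not taken from the original source.
if __name__ == "__main__":
    device(
        alpha=1.0,
        bag_size=20,
        batch_size=512,
        embedding_dim=128,
        weights_precision=SparseType.FP32,
        stoc=False,
        iters=100,
        managed="device",
        mixed=False,
        num_embeddings=100_000,
        num_tables=32,
        reuse=0.0,
        row_wise=True,
        weighted=False,
        weighted_num_requires_grad=None,
        flush_gpu_cache_size_mb=0,
    )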
class SplitTableBatchedEmbeddingBagsCodegenOp(OperatorInterface):
    def __init__(self):
        super(SplitTableBatchedEmbeddingBagsCodegenOp, self).__init__()
        self.op = None
        self.fwd_out: torch.Tensor = None
        self.grad_in: torch.Tensor = None

    def build(
        self,
        num_tables: int,
        rows: Union[int, list],
        dims: Union[int, list],
        pooling: int,
        weighted: bool,
        weights_precision: str,
        optimizer: str,
    ):
        logger.debug(
            f"build: [{num_tables}, {rows}, {dims}, {pooling}, {weighted}, "
            f"{weights_precision}, {optimizer}]"
        )
        rows_list = rows if isinstance(rows, list) else [rows]
        dims_list = dims if isinstance(dims, list) else [dims]
        if self.device.startswith("cpu"):
            compute_device = ComputeDevice.CPU
            location = EmbeddingLocation.HOST
        elif self.device.startswith("cuda"):
            compute_device = ComputeDevice.CUDA
            location = EmbeddingLocation.DEVICE
        else:
            raise ValueError(f"Unknown compute device {self.device}")

        # split_table op options from actual runs of
        # caffe2/torch/fb/module_factory/proxy_module/grouped_sharded_embedding_bag.py
        self.op = SplitTableBatchedEmbeddingBagsCodegen(
            [
                (
                    rows_list[i],
                    dims_list[i],
                    location,
                    compute_device,
                )
                for i in range(num_tables)
            ],
            optimizer=OptimType(optimizer),
            pooling_mode=PoolingMode(pooling),
            weights_precision=SparseType(weights_precision),
            stochastic_rounding=True,
            cache_algorithm=CacheAlgorithm.LFU,
            cache_load_factor=0.0,
            cache_reserved_memory=12.0,
        )
        logger.debug(f"op embedding_specs: {self.op.embedding_specs}")

    def cleanup(self):
        logger.debug("op cleanup")
        self.op = None
        self.grad_in = None
        self.fwd_out = None

    def forward(self, *args, **kwargs):
        self.fwd_out = self.op.forward(args[0], args[1], args[2])

    def create_grad(self):
        self.grad_in = torch.ones_like(self.fwd_out)

    def backward(self, grad=None):
        if grad is not None:
            self.fwd_out.backward(grad)
        else:
            if self.grad_in is None:
                self.create_grad()
            self.fwd_out.backward(self.grad_in)
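# A hypothetical usage sketch of the wrapper above. `OperatorInterface` is assumed to
# provide the `self.device` string that `build()` reads; the attribute assignment and
# all argument values below are illustrative only, not taken from the original source.
op = SplitTableBatchedEmbeddingBagsCodegenOp()
op.device = "cpu"  # assumed to be set by the benchmark harness in real runs
op.build(
    num_tables=2,
    rows=[10_000, 20_000],
    dims=[64, 64],
    pooling=0,  # assumed to map to PoolingMode.SUM
    weighted=False,
    weights_precision="fp32",
    optimizer="exact_row_wise_adagrad",
)

# forward() reads (indices, offsets, per_sample_weights) positionally:
# 2 tables x 64 bags x 8 indices per bag.
indices = torch.randint(0, 10_000, (1024,), dtype=torch.int64)
offsets = torch.arange(0, 1025, 8, dtype=torch.int64)
op.forward(indices, offsets, None)
op.backward()  # uses the all-ones gradient created by create_grad()
op.cleanup()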