def getBiBW(self, numIters, memSize):
    """Measure bidirectional point-to-point bandwidth between src_rank and dst_rank.

    Runs ``numIters`` timed rounds. In each round both ranks post a window of
    non-blocking send/recv pairs flowing in opposite directions, then wait for
    all of them to complete. The mean per-message latency is converted into an
    algorithmic bandwidth over ``2 * memSize`` bytes because traffic moves in
    both directions simultaneously.

    Args:
        numIters: number of timed measurement rounds.
        memSize: message size in bytes (for one direction).

    Returns:
        Average bidirectional algorithmic bandwidth as computed by
        comms_utils.getAlgBW.
    """
    logging.debug("STATUS: begin BiBW test.")
    self.collectiveArgs.asyncOp = True
    # get bidirectional bandwidth
    biLatencyNS = []
    for _ in range(numIters):
        self.backendFuncs.sync_barrier(self.collectiveArgs)
        start = time.monotonic()
        for w in range(self.collectiveArgs.window):
            # Tags in [0, window) carry src->dst traffic; tags in
            # [window, 2*window) carry dst->src traffic, so the two
            # directions never match each other's messages.
            if self.collectiveArgs.global_rank == self.collectiveArgs.src_rank:
                self.backendFuncs.isend(self.collectiveArgs,
                                        self.collectiveArgs.dst_rank,
                                        tag=w)
                self.backendFuncs.irecv(self.collectiveArgs,
                                        self.collectiveArgs.dst_rank,
                                        tag=w + self.collectiveArgs.window)
            elif self.collectiveArgs.global_rank == self.collectiveArgs.dst_rank:
                self.backendFuncs.irecv(self.collectiveArgs,
                                        self.collectiveArgs.src_rank,
                                        tag=w)
                self.backendFuncs.isend(self.collectiveArgs,
                                        self.collectiveArgs.src_rank,
                                        tag=w + self.collectiveArgs.window)
        self.backendFuncs.complete_accel_ops(self.collectiveArgs)
        biLatencyNS.append(
            (time.monotonic() - start) * 1e9
        )  # keeping time in NS, helps in dividing data by nanosecond
    # Each timed round covered a full window of messages; reduce to
    # per-message latency before averaging.
    biLatencyNS = [lat / self.collectiveArgs.window for lat in biLatencyNS]
    biLatencyNS = np.mean(np.array(biLatencyNS))
    # 2x memSize: data moves in both directions at once.
    _, avgBiBW = comms_utils.getAlgBW(biLatencyNS, 2 * memSize, 1)
    # Bug fix: this previously logged "end UniBW test." (copy-paste from getUniBW).
    logging.debug("STATUS: end BiBW test.")
    return avgBiBW
def test_no_iterations(self):
    """getAlgBW must report zero avg-iteration time AND zero bandwidth when numIters == 0."""
    elapsedTimeNs = 30000
    dataSize = 90000  # bytes
    numIters = 0
    (avgIterNS, algBW) = comms_utils.getAlgBW(elapsedTimeNs, dataSize, numIters)
    # If we had no iterations, then we have no avg iteration time or algBW.
    # Bug fix: the old code called assertEqual(0.0, avgIterNS, algBW), which
    # used algBW as the assertion *message* argument and never checked it.
    self.assertEqual(0.0, avgIterNS)
    self.assertEqual(0.0, algBW)
def getUniBW(self, numIters, memSize):
    """Measure unidirectional bandwidth from src_ranks to dst_ranks.

    Each timed round posts a window of non-blocking sends (on source ranks)
    or receives (on destination ranks) and waits for all of them to finish.
    The mean per-message latency across rounds is converted into an
    algorithmic bandwidth for a single message of ``memSize`` bytes.

    Args:
        numIters: number of timed measurement rounds.
        memSize: message size in bytes.

    Returns:
        Average unidirectional algorithmic bandwidth as computed by
        comms_utils.getAlgBW.
    """
    cArgs = self.collectiveArgs
    backend = self.backendFuncs
    logger.debug(
        "STATUS: begin UniBW test with src_ranks=%s, dst_ranks=%s."
        % (cArgs.src_ranks, cArgs.dst_ranks))
    cArgs.asyncOp = True
    # One elapsed-time sample (nanoseconds) per measurement round.
    latSamplesNS = []
    for _ in range(numIters):
        backend.sync_barrier(cArgs)
        tBegin = time.monotonic()
        for tag in range(cArgs.window):
            # src_ranks[i] is paired with dst_ranks[i]; look up this rank's
            # peer by its position in whichever list it belongs to.
            if cArgs.global_rank in cArgs.src_ranks:
                pos = cArgs.src_ranks.index(cArgs.global_rank)
                backend.isend(cArgs, cArgs.dst_ranks[pos], tag=tag)
            elif cArgs.global_rank in cArgs.dst_ranks:
                pos = cArgs.dst_ranks.index(cArgs.global_rank)
                backend.irecv(cArgs, cArgs.src_ranks[pos], tag=tag)
        backend.complete_accel_ops(cArgs)
        # Keep time in NS; simplifies dividing data size by nanoseconds.
        latSamplesNS.append((time.monotonic() - tBegin) * 1e9)
    # Each sample covered a full window of messages: reduce to per-message
    # latency, then average across rounds.
    meanLatencyNS = np.mean(
        np.array([sample / cArgs.window for sample in latSamplesNS]))
    _, avgUniBW = comms_utils.getAlgBW(meanLatencyNS, memSize, 1)
    logger.debug("STATUS: end UniBW test.")
    return avgUniBW
def test_iterations(self):
    """getAlgBW with a positive iteration count divides elapsed time evenly."""
    elapsedTimeNs = 30000
    dataSize = 90000  # bytes
    numIters = 3
    avgIterNS, algBW = comms_utils.getAlgBW(elapsedTimeNs, dataSize, numIters)
    # avgIterNS = elapsedTimeNs / numIters = 30000 / 3 = 10000
    self.assertEqual(10000.0, avgIterNS)
    # algBW = dataSize / avgIterNS = 90000 / 10000 = 9
    self.assertEqual(9.0, algBW)
def runColl(self, comm_fn=None, compute_fn=None):
    """Run and time one collective configuration (warmup + measured iterations).

    Performs ``numWarmupIters`` untimed rounds, synchronizes all ranks, then
    times ``numIters`` rounds of ``comm_fn`` (optionally interleaved with
    ``numComputePerColl`` calls of ``compute_fn`` per round). In synchronous
    mode (``asyncOp`` false) each round waits for completion before the next.

    Args:
        comm_fn: callable posting the collective; its return value is appended
            to ``collectiveArgs.waitObj``. May be None (no communication).
        compute_fn: optional callable doing compute work per round.

    Returns:
        Tuple ``(avgIterNS, algBW, busBW, memSize, x)`` where ``x`` is the last
        element of the output tensor, read to keep the collective from being
        optimized away.
    """
    self.backendFuncs.complete_accel_ops(self.collectiveArgs, initOp=True)
    numElements = self.collectiveArgs.numElements

    # Initial warmup iters (not timed).
    for _ in range(self.collectiveArgs.numWarmupIters):
        if comm_fn is not None:
            self.collectiveArgs.waitObj.append(
                comm_fn(self.collectiveArgs,
                        retFlag=self.collectiveArgs.asyncOp))
        if compute_fn is not None:
            for _ in range(self.collectiveArgs.numComputePerColl):
                compute_fn(self.collectiveArgs)
        if not self.collectiveArgs.asyncOp:  # should be synchronous, do wait.
            self.backendFuncs.complete_accel_ops(self.collectiveArgs)
    self.backendFuncs.complete_accel_ops(
        self.collectiveArgs
    )  # should be done regardless of blocking or non-blocking.
    # Align all ranks before starting the measured section.
    self.backendFuncs.barrier(self.collectiveArgs, "runcoll")

    # Measuring time.
    start = time.monotonic()  # available only in py3
    for _ in range(self.collectiveArgs.numIters):
        if comm_fn is not None:
            self.collectiveArgs.waitObj.append(
                comm_fn(self.collectiveArgs,
                        retFlag=self.collectiveArgs.asyncOp))
        if compute_fn is not None:
            for _ in range(self.collectiveArgs.numComputePerColl):
                # TODO: investigate the cache effect
                # Flush the cache
                # _ = torch.rand(6 * 1024 * 1024 // 4).float() * 2 # V100 6MB L2 cache
                compute_fn(self.collectiveArgs)
        if not self.collectiveArgs.asyncOp:  # should be synchronous, do wait.
            self.backendFuncs.complete_accel_ops(self.collectiveArgs)
    # Drain any outstanding async work so `end` covers all posted ops.
    self.backendFuncs.complete_accel_ops(self.collectiveArgs)
    end = time.monotonic()  # available only in py3

    x = self.collectiveArgs.opTensor[
        numElements - 1].item()  # to ensure collective won't be optimized away.
    elapsedTimeNS = (
        end - start
    ) * 1e9  # keeping time in NS, helps in dividing data by nanoseconds
    avgIterNS, algBW = comms_utils.getAlgBW(elapsedTimeNS,
                                            self.collectiveArgs.dataSize,
                                            self.collectiveArgs.numIters)
    busBW = self.backendFuncs.getBusBW(self.collectiveArgs.collective, algBW,
                                       self.collectiveArgs.world_size)
    memSize = self.backendFuncs.get_mem_size(self.collectiveArgs)
    # Re-sync so no rank races ahead into the next configuration.
    self.backendFuncs.barrier(self.collectiveArgs, "runcoll2")
    return (avgIterNS, algBW, busBW, memSize, x)
def reportBenchTime(self, commsParams, allSizes, tensorList, results):
    """Print a per-message-size latency/bandwidth summary table across all ranks.

    For every size in ``allSizes``, gathers that size's latency sample from each
    rank's tensor in ``tensorList``, computes p50/p75/p95 percentiles, derives
    algorithmic and bus bandwidth from the p50 latency, and prints one table row.

    Args:
        commsParams: benchmark parameters; ``collective`` and ``backend`` are read.
        allSizes: list of message sizes (bytes), in the order they were measured.
        tensorList: per-rank latency tensors (or one flattened tensor on XLA).
        results: dict keyed by size with at least "memSize" and "num_elements".
    """
    self.collectiveArgs.collective = commsParams.collective
    self.collectiveArgs.numIters = 1  # commsParams.numIters
    print(
        "\n\tCOMMS-RES\tsize (B)\t num-elements\t Latency(us):p50\tp75\t\tp95\t algBW(GB/s)\t busBW(GB/s)"
    )
    for idx, curSize in enumerate(allSizes):
        if commsParams.backend == "xla":
            # On XLA tensorList is one flattened tensor: reshape to
            # (ranks, sizes) and take this size's column.
            latencyAcrossRanks = torch.transpose(
                tensorList.view(-1, len(allSizes)), 0, 1)[idx]
            latencyAcrossRanks = latencyAcrossRanks.cpu().detach().numpy()
        else:
            # One latency tensor per rank: gather this size's entry from each.
            # (Was a manual append loop plus a leftover commented-out debug
            # print; both cleaned up.)
            latencyAcrossRanks = np.array(
                [curRankTensor[idx].item() for curRankTensor in tensorList])

        p50 = np.percentile(latencyAcrossRanks, 50)
        p75 = np.percentile(latencyAcrossRanks, 75)
        p95 = np.percentile(latencyAcrossRanks, 95)

        self.collectiveArgs.dataSize = curSize
        # p50 is in microseconds; getAlgBW expects nanoseconds, hence * 1e3.
        # The avg-iteration-time return value was unused, so it is discarded.
        _, algBW = comms_utils.getAlgBW(p50 * 1e3,
                                        self.collectiveArgs.dataSize,
                                        self.collectiveArgs.numIters)
        busBW = self.backendFuncs.getBusBW(self.collectiveArgs.collective,
                                           algBW,
                                           self.collectiveArgs.world_size)
        print("\tCOMMS-RES\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s" % (
            results[curSize]["memSize"],
            str("%d" % (results[curSize]["num_elements"])),
            str("%.1f" % (p50)),
            str("%.1f" % (p75)),
            str("%.1f" % (p95)),
            str("%.3f" % (algBW)),
            str("%.3f" % (busBW)),
        ))
def reportBenchTime(collectiveArgs, commsParams, allSizes, tensorList, results):
    """Print per-message-size latency percentiles and bandwidths across ranks.

    For every size in ``allSizes``, collects that size's latency sample from
    each rank's tensor, computes p50/p75/p95, converts the p50 latency into
    algorithmic and bus bandwidth, and prints one table row.
    """
    collectiveArgs.collective = commsParams.collective
    collectiveArgs.numIters = 1  # commsParams.numIters
    print(
        "\n\tCOMMS-RES\tsize (B)\t num-elements\t Latency(us):p50\tp75\t\tp95\t algBW(GB/s)\t busBW(GB/s)"
    )
    for idx, curSize in enumerate(allSizes):
        # Gather this size's latency sample from every rank's tensor.
        latencyAcrossRanks = np.array(
            [rankTensor[idx].item() for rankTensor in tensorList])

        percentiles = {
            q: np.percentile(latencyAcrossRanks, q) for q in (50, 75, 95)
        }

        collectiveArgs.dataSize = curSize
        # p50 is in microseconds; getAlgBW takes nanoseconds, hence * 1e3.
        avgIterNS, algBW = comms_utils.getAlgBW(percentiles[50] * 1e3,
                                                collectiveArgs.dataSize,
                                                collectiveArgs.numIters)
        busBW = collectiveArgs.backendFuncs.getBusBW(
            collectiveArgs.collective, algBW, collectiveArgs.world_size)

        print("\tCOMMS-RES\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s" % (
            results[curSize]["memSize"],
            str("%d" % (results[curSize]["num_elements"])),
            str("%.1f" % (percentiles[50])),
            str("%.1f" % (percentiles[75])),
            str("%.1f" % (percentiles[95])),
            str("%.3f" % (algBW)),
            str("%.3f" % (busBW)),
        ))
def runColl(self, comm_fn=None, compute_fn=None, comm_fn_pair=None):
    """Run and time a collective, optionally overlapped with a paired collective.

    Performs ``numWarmupIters`` untimed rounds, then ``numIters`` timed rounds.
    When ``comm_fn_pair`` is given, the pair collective is posted alongside
    ``comm_fn`` on a second process group (round-robin over
    ``collectiveArgs.groups``) with ``asyncOp`` temporarily forced True so the
    two can overlap; the caller's ``asyncOp`` is restored each round.

    In blocking mode, each timed round resets the output tensor(s), barriers,
    and accumulates that round's latency. In non-blocking mode, only the final
    ``complete_accel_ops`` drain is timed.

    Returns:
        Tuple ``(avgIterNS, algBW, busBW, memSize, x, x_pair)``; ``x``/``x_pair``
        are last elements of the output tensor(s), read so the collectives
        cannot be optimized away. ``x_pair`` is None without ``comm_fn_pair``.
        With a pair collective, ``memSize``/``algBW``/``busBW`` are combined
        totals for both collectives.
    """
    self.backendFuncs.complete_accel_ops(self.collectiveArgs, initOp=True)
    numElements = self.collectiveArgs.numElements
    if comm_fn_pair is not None:
        numElements_pair = self.collectiveArgs.numElements_pair

    # Initial warmup iters (not timed).
    for _ in range(self.collectiveArgs.numWarmupIters):
        if comm_fn is not None:
            # With multiple process groups, pin main collective to groups[0].
            if self.collectiveArgs.num_pgs > 1:
                self.collectiveArgs.group = self.collectiveArgs.groups[0]
            comm_fn(self.collectiveArgs)
        if comm_fn_pair is not None:
            # Pair collective runs on groups[1] when available.
            if self.collectiveArgs.num_pgs > 1:
                self.collectiveArgs.group = self.collectiveArgs.groups[1]
            comm_fn_pair(self.collectiveArgs, pair=True)
        if compute_fn is not None:
            for _ in range(self.collectiveArgs.numComputePerColl):
                compute_fn(self.collectiveArgs)
        if not self.collectiveArgs.asyncOp:  # should be synchronous, do wait.
            self.backendFuncs.complete_accel_ops(self.collectiveArgs)
    self.backendFuncs.sync_barrier(self.collectiveArgs, desc="runColl_begin")

    # Measuring time.
    elapsedTimeNS = 0.0
    for _ in range(self.collectiveArgs.numIters):
        if not self.collectiveArgs.asyncOp:
            # should be synchronous, do barrier and wait for collective
            self.setTensorVal(
                self.collectiveArgs.opTensor)  # reset tensor values
            if comm_fn_pair is not None:
                self.setTensorVal(self.collectiveArgs.opTensor_pair)
            self.backendFuncs.sync_barrier(self.collectiveArgs)
        # Pair mode forces async so both collectives can be in flight at once;
        # remember the caller's setting to restore it after posting.
        oldAsyncOp = self.collectiveArgs.asyncOp
        round_robin_group = cycle(self.collectiveArgs.groups)
        if comm_fn_pair is not None:
            self.collectiveArgs.asyncOp = True
        start = time.monotonic()  # available only in py3
        if comm_fn is not None:
            self.collectiveArgs.group = next(round_robin_group)
            comm_fn(self.collectiveArgs)
        if comm_fn_pair is not None:
            self.collectiveArgs.group = next(round_robin_group)
            comm_fn_pair(self.collectiveArgs, pair=True)
        if compute_fn is not None:
            for _ in range(self.collectiveArgs.numComputePerColl):
                # TODO: investigate the cache effect
                # Flush the cache
                # _ = torch.rand(6 * 1024 * 1024 // 4).float() * 2 # V100 6MB L2 cache
                compute_fn(self.collectiveArgs)
        self.collectiveArgs.asyncOp = oldAsyncOp
        if not self.collectiveArgs.asyncOp:
            # should be synchronous, wait for the collective
            self.backendFuncs.complete_accel_ops(self.collectiveArgs)
            # Measuring time: blocking mode accumulates per-round latency.
            elapsedTimeNS += (
                time.monotonic() - start
            ) * 1e9  # keeping time in NS, helps in dividing data by nanosecond

    # Non-blocking mode: only this final drain of outstanding work is timed.
    start = time.monotonic()  # available only in py3
    self.backendFuncs.complete_accel_ops(self.collectiveArgs)
    end = time.monotonic()  # available only in py3

    if isinstance(self.collectiveArgs.opTensor, list):
        # allgather is a list of tensors
        x = self.collectiveArgs.opTensor[-1][-1].item(
        )  # to ensure collective won't be optimized away.
    else:
        x = self.collectiveArgs.opTensor[
            numElements -
            1].item()  # to ensure collective won't be optimized away.
    x_pair = None
    if comm_fn_pair is not None:
        if isinstance(self.collectiveArgs.opTensor_pair, list):
            # allgather is a list of tensors
            x_pair = self.collectiveArgs.opTensor_pair[-1][-1].item(
            )  # to ensure collective won't be optimized away.
        else:
            x_pair = self.collectiveArgs.opTensor_pair[
                numElements_pair -
                1].item()  # to ensure collective won't be optimized away.

    elapsedTimeNS += (
        end - start
    ) * 1e9  # keeping time in NS, helps in dividing data by nanoseconds
    memSize = self.backendFuncs.get_mem_size(self.collectiveArgs)
    avgIterNS, algBW = comms_utils.getAlgBW(elapsedTimeNS, memSize,
                                            self.collectiveArgs.numIters)
    busBW = self.backendFuncs.getBusBW(self.collectiveArgs.collective, algBW,
                                       self.collectiveArgs.world_size)
    if comm_fn_pair is not None:
        # Fold the pair collective's traffic into the reported totals.
        memSize_pair = self.backendFuncs.get_mem_size(self.collectiveArgs,
                                                      pair=True)
        memSize += memSize_pair
        _, algBW_pair = comms_utils.getAlgBW(elapsedTimeNS, memSize_pair,
                                             self.collectiveArgs.numIters)
        algBW += algBW_pair
        busBW_pair = self.backendFuncs.getBusBW(
            self.collectiveArgs.collective_pair, algBW_pair,
            self.collectiveArgs.world_size)
        busBW += busBW_pair
    self.backendFuncs.sync_barrier(self.collectiveArgs, "runColl_end")
    return (avgIterNS, algBW, busBW, memSize, x, x_pair)
def runColl(self, comm_fn=None, compute_fn=None, comm_fn_pair=None):
    """Run and time a collective (optionally paired with a second overlapped one).

    Warmup and measured rounds share one loop: after ``numWarmupIters``
    iterations the elapsed-time accumulator and quantization timers are reset,
    so only the remaining ``numIters`` rounds count. ``comm_fn`` and
    ``comm_fn_pair`` are assumed callable (the backend's ``noop`` stands in for
    "disabled" — note the enable flags are computed by comparing against
    ``self.backendFuncs.noop``). In pair mode ``asyncOp`` is forced True so the
    two collectives overlap; note it is NOT restored afterwards here.

    In blocking mode each round barriers first and then waits for completion,
    accumulating per-round latency; in non-blocking mode only the final drain
    is timed.

    Returns:
        dict with keys "timeUS" (avg per-iteration time, microseconds),
        "algBW", "busBW", and "memSize" (combined with the pair collective's
        totals when pair mode is enabled).
    """
    self.backendFuncs.complete_accel_ops(self.collectiveArgs, initOp=True)
    self.backendFuncs.sync_barrier(self.collectiveArgs, desc="runColl_begin")

    elapsedTimeNS = 0.0
    is_blocking = not self.collectiveArgs.asyncOp
    # A fn is "enabled" only if provided and not the backend's noop stub.
    enable_comms = False if (
        comm_fn is None or comm_fn == self.backendFuncs.noop) else True
    enable_compute = False if (
        compute_fn is None or compute_fn == self.backendFuncs.noop) else True
    enable_comms_pair = False if (
        comm_fn_pair is None
        or comm_fn_pair == self.backendFuncs.noop) else True

    # for comms pair mode, force async comms for overlapping evaluation
    if enable_comms_pair:
        self.collectiveArgs.asyncOp = True
    for nIter in range(self.collectiveArgs.numWarmupIters +
                       self.collectiveArgs.numIters):
        if nIter == self.collectiveArgs.numWarmupIters:
            # Start measuring time after warmup iterations
            elapsedTimeNS = 0.0
            self.collectiveArgs.quant_time.reset()
            self.collectiveArgs.dequant_time.reset()
        # reset tensor values for data validation check
        if enable_comms:
            self.setTensorVal(self.collectiveArgs.opTensor)
        # for blocking mode, do barrier before starting collective
        if is_blocking:
            self.backendFuncs.sync_barrier(self.collectiveArgs)

        start = time.monotonic()  # available only in py3
        self.collectiveArgs.group = self.backendFuncs.get_next_group()
        comm_fn(self.collectiveArgs)
        # post another collective if on comms-pair mode, otherwise it's noop
        self.collectiveArgs.group = self.backendFuncs.get_next_group()
        comm_fn_pair(self.collectiveArgs, pair=enable_comms_pair)

        if enable_compute:
            for _ in range(self.collectiveArgs.numComputePerColl):
                # TODO: investigate the cache effect
                # Flush the cache
                # _ = torch.rand(6 * 1024 * 1024 // 4).float() * 2 # V100 6MB L2 cache
                compute_fn(self.collectiveArgs)
        if is_blocking:  # should be synchronous, wait for the collective
            self.backendFuncs.complete_accel_ops(self.collectiveArgs)
        # Measuring time.
        elapsedTimeNS += (
            time.monotonic() - start
        ) * 1e9  # keeping time in NS, helps in dividing data by nanosecond

    # Non-blocking mode: time the final drain of all outstanding work.
    start = time.monotonic()  # available only in py3
    self.backendFuncs.complete_accel_ops(self.collectiveArgs)
    end = time.monotonic()  # available only in py3

    # Touch output tensors so the collectives can't be optimized away.
    ensureTensorFlush(self.collectiveArgs.opTensor)
    if enable_comms_pair:
        ensureTensorFlush(self.collectiveArgs.opTensor_pair)

    elapsedTimeNS += (
        end - start
    ) * 1e9  # keeping time in NS, helps in dividing data by nanoseconds

    memSize = self.backendFuncs.get_mem_size(self.collectiveArgs)

    avgIterNS, algBW = comms_utils.getAlgBW(elapsedTimeNS, memSize,
                                            self.collectiveArgs.numIters)
    busBW = self.backendFuncs.getBusBW(
        self.collectiveArgs.collective,
        algBW,
        self.collectiveArgs,
    )
    if enable_comms_pair:
        # Fold the pair collective's traffic into the reported totals.
        memSize_pair = self.backendFuncs.get_mem_size(
            self.collectiveArgs, pair=enable_comms_pair)
        memSize += memSize_pair

        _, algBW_pair = comms_utils.getAlgBW(elapsedTimeNS, memSize_pair,
                                             self.collectiveArgs.numIters)
        algBW += algBW_pair

        busBW += self.backendFuncs.getBusBW(
            self.collectiveArgs.collective_pair,
            algBW_pair,
            self.collectiveArgs,
        )

    self.backendFuncs.sync_barrier(self.collectiveArgs, desc="runColl_end")

    results = {
        "timeUS": avgIterNS / 1e3,
        "algBW": algBW,
        "busBW": busBW,
        "memSize": memSize,
    }
    return results