def _buffer_from_gpuarray(self, array):
    data = array.gpudata
    # data might be an `int` or a `DeviceAllocation`
    if isinstance(data, cuda.DeviceAllocation):
        return data.as_buffer(array.nbytes)
    else:
        # construct the buffer from the raw device pointer
        return MPI.make_buffer(array.gpudata, array.nbytes)
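# A minimal usage sketch (illustrative only, not part of the original module):
# assuming a PyCUDA context, mpi4py, and a CUDA-aware MPI build, this shows how
# a GPUArray's device allocation could be wrapped and handed to a non-blocking
# send. The names `comm`, `array`, and `dest` are placeholders.
def _example_gpuarray_send(comm, array, dest):
    # gpudata returned by gpuarray.to_gpu() is a DeviceAllocation, so the
    # as_buffer() branch of _buffer_from_gpuarray() applies here.
    buf = array.gpudata.as_buffer(array.nbytes)
    request = comm.Isend([buf, MPI.BYTE], dest=dest, tag=57)
    return request, buf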
def send(data, data_package, dest=None, gpu_direct=True):
    global s_requests
    tag = 52
    dp = data_package
    # send the data_package metadata first
    send_data_package(dp, dest=dest, tag=tag)

    bytes = dp.data_bytes
    memory_type = dp.memory_type

    if log_type in ['time', 'all']:
        st = time.time()

    flag = False
    request = None
    if memory_type == 'devptr':  # data is in the GPU
        if gpu_direct:  # use GPU direct
            devptr = data
            buf = MPI.make_buffer(devptr.__int__(), bytes)
            ctx.synchronize()
            request = comm.Isend([buf, MPI.BYTE], dest=dest, tag=57)
            if VIVALDI_BLOCKING:
                MPI.Request.Wait(request)
            s_requests.append((request, buf, devptr))
            flag = True
        else:  # do not use GPU direct
            # stage the data through the CPU
            shape = dp.data_memory_shape
            dtype = dp.data_contents_memory_dtype
            buf = numpy.empty(shape, dtype=dtype)
            cuda.memcpy_dtoh_async(buf, data, stream=stream_list[1])
            # the asynchronous device-to-host copy must finish before
            # MPI starts reading the host buffer
            stream_list[1].synchronize()
            request = comm.Isend(buf, dest=dest, tag=57)
            if VIVALDI_BLOCKING:
                MPI.Request.Wait(request)
            s_requests.append((request, buf, None))
    else:  # data is in the CPU
        # GPU direct does not apply to host data, so always use a plain MPI send
        if dp.data_dtype == numpy.ndarray:
            request = comm.Isend(data, dest=dest, tag=57)
            if VIVALDI_BLOCKING:
                MPI.Request.Wait(request)
            s_requests.append((request, data, None))

    if log_type in ['time', 'all']:
        u = dp.unique_id
        bytes = dp.data_bytes
        t = time.time() - st
        ms = 1000 * t
        bw = bytes / GIGA / t
        if flag:
            log("rank%d, \"%s\", u=%d, from rank%d to rank%d GPU direct send, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"
                % (rank, name, u, rank, dest, bytes / MEGA, ms, bw), 'time', log_type)
        else:
            log("rank%d, \"%s\", u=%d, from rank%d to rank%d MPI data transfer, Bytes: %dMB, time: %.3f ms, speed: %.3f GByte/sec"
                % (rank, name, u, rank, dest, bytes / MEGA, ms, bw), 'time', log_type)

    return request
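# Illustrative sketch of the staged (non GPU-direct) path taken by send() above:
# the device buffer is copied to host memory asynchronously, the stream is
# synchronized, and the host buffer is then posted as a non-blocking MPI send.
# The parameters (comm, stream, devptr, shape, dtype, dest) are placeholders,
# not names from the original module.
def _example_staged_send(comm, stream, devptr, shape, dtype, dest):
    host_buf = numpy.empty(shape, dtype=dtype)
    cuda.memcpy_dtoh_async(host_buf, devptr, stream=stream)
    stream.synchronize()  # the copy must complete before Isend reads host_buf
    request = comm.Isend(host_buf, dest=dest, tag=57)
    return request, host_buf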
def recv():
    # DEBUG flag ################################################
    RECV_CHECK = False

    # Implementation ################################################
    data_package = comm.recv(source=source, tag=52)
    dp = data_package
    memory_type = dp.memory_type

    if memory_type == 'devptr':
        bytes = dp.data_bytes
        devptr, usage = malloc_with_swap_out(bytes)
        buf = MPI.make_buffer(devptr.__int__(), bytes)
        request = comm.Irecv([buf, MPI.BYTE], source=source, tag=57)
        if VIVALDI_BLOCKING:
            MPI.Request.Wait(request)
        return devptr, data_package, request, buf
    else:
        data_dtype = dp.data_dtype
        if data_dtype == numpy.ndarray:
            data_memory_shape = dp.data_memory_shape
            dtype = dp.data_contents_memory_dtype
            data = numpy.empty(data_memory_shape, dtype=dtype)
            request = comm.Irecv(data, source=source, tag=57)
            if RECV_CHECK:  # recv check
                MPI.Request.Wait(request)
                print "RECV CHECK", data
            if VIVALDI_BLOCKING:
                MPI.Request.Wait(request)
            return data, data_package, request, None

    return None, None, None, None
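# Illustrative sketch of the two-phase protocol implemented by send()/recv()
# above: the data_package metadata travels first on tag 52, then the payload
# follows as a separate non-blocking transfer on tag 57. The `meta` dictionary
# here merely stands in for the real data_package object; run with two ranks.
def _example_two_phase_transfer(comm):
    rank = comm.Get_rank()
    if rank == 0:
        payload = numpy.arange(1024, dtype=numpy.float32)
        meta = {'shape': payload.shape, 'dtype': 'float32'}
        comm.send(meta, dest=1, tag=52)            # phase 1: pickled metadata
        req = comm.Isend(payload, dest=1, tag=57)  # phase 2: raw payload
        req.Wait()
    elif rank == 1:
        meta = comm.recv(source=0, tag=52)         # phase 1: pickled metadata
        payload = numpy.empty(meta['shape'], dtype=meta['dtype'])
        req = comm.Irecv(payload, source=0, tag=57)  # phase 2: raw payload
        req.Wait()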