import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from test_collective_base import TestCollectiveRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveReduceScatter(TestCollectiveRunnerBase):
    def __init__(self):
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program):
        ring_id = 0
        nranks = 2
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(name="tindata",
                                  shape=[10, 1000],
                                  dtype='float32')
            # Each rank receives the elementwise sum of its own
            # [10 // nranks, 1000] shard of the inputs.
            toutdata = fluid.layers.collective._c_reducescatter(
                tindata, nranks)
        return toutdata


if __name__ == "__main__":
    runtime_main(TestCollectiveReduceScatter, "reducescatter", 0)
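# For reference, a minimal NumPy sketch (not part of the test) of the
# reduce-scatter semantics exercised above: with nranks = 2 and per-rank
# inputs of shape [10, 1000], every rank ends up holding the elementwise
# sum of its own [5, 1000] shard. The helper below is hypothetical and
# only illustrates the expected result.
import numpy as np


def reduce_scatter_ref(per_rank_inputs):
    """Sum all rank inputs elementwise, then shard the result by rank."""
    nranks = len(per_rank_inputs)
    total = np.sum(per_rank_inputs, axis=0)  # elementwise sum across ranks
    return np.split(total, nranks, axis=0)   # shard r is delivered to rank r

# inputs = [np.random.rand(10, 1000).astype('float32') for _ in range(2)]
# shards = reduce_scatter_ref(inputs)  # shards[r].shape == (5, 1000)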
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from test_collective_base import TestCollectiveRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveSendRecvDynamicShape(TestCollectiveRunnerBase):
    def __init__(self):
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program):
        ring_id = self.global_ring_id
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(name="tindata",
                                  shape=[10, 1000],
                                  dtype='float64',
                                  append_batch_size=False)
            if self.rank == 0:
                # Rank 0 pushes its tensor to rank 1.
                main_prog.global_block().append_op(type="send_v2",
                                                   inputs={'X': tindata},
                                                   attrs={
                                                       'ring_id': ring_id,
                                                       'peer': 1,
                                                       'use_calc_stream': True,
                                                       'dynamic_shape': True
                                                   })
            else:
                # Rank 1 receives into its local buffer.
                main_prog.global_block().append_op(type="recv_v2",
                                                   outputs={'Out': tindata},
                                                   attrs={
                                                       'peer': 0,
                                                       'ring_id': ring_id,
                                                       'dtype': tindata.dtype,
                                                       'out_shape': tindata.shape,
                                                       'use_calc_stream': True,
                                                       'dynamic_shape': True
                                                   })
        return tindata


if __name__ == "__main__":
    runtime_main(TestCollectiveSendRecvDynamicShape,
                 "sendrecv_dynamic_shape", 0)
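# The dynamic_shape attribute above is what distinguishes this runner from
# the plain send/recv test: the pair first exchanges the tensor's shape and
# then its payload, so the receiver does not have to rely on the statically
# declared out_shape. A rough, runnable analogy using a multiprocessing
# Pipe (the helpers and the two-message protocol are illustrative
# assumptions, not Paddle internals):
import numpy as np
from multiprocessing import Pipe


def send_dynamic(conn, arr):
    conn.send(arr.shape)  # 1) announce the shape
    conn.send(arr)        # 2) then ship the payload


def recv_dynamic(conn):
    shape = conn.recv()   # 1) learn the incoming shape
    arr = conn.recv()     # 2) receive a payload of exactly that shape
    assert arr.shape == tuple(shape)
    return arr

# left, right = Pipe()
# send_dynamic(left, np.ones((4, 5)))
# out = recv_dynamic(right)  # out.shape == (4, 5)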
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import core
from test_collective_base import TestCollectiveRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveConcat(TestCollectiveRunnerBase):
    def __init__(self):
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program):
        ring_id = 0
        nranks = 2
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(name="tindata",
                                  shape=[10, 1000],
                                  dtype='float32')
            toutdata = main_prog.current_block().create_var(
                name="outofconcat",
                dtype='float32',
                type=core.VarDesc.VarType.LOD_TENSOR,
                persistable=False,
                stop_gradient=False)
            # Gather the per-rank slices and join them on the last axis.
            main_prog.global_block().append_op(type="c_concat",
                                               inputs={'X': tindata},
                                               attrs={
                                                   'ring_id': ring_id,
                                                   'rank': self.rank,
                                                   'nranks': nranks
                                               },
                                               outputs={'Out': toutdata})
        return toutdata


if __name__ == "__main__":
    runtime_main(TestCollectiveConcat, "concat", 0)
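# For reference, a NumPy sketch of the c_concat semantics above
# (illustrative assumption based on the [10, 1000] input and nranks = 2):
# the per-rank slices are gathered and joined along the last axis, so every
# rank receives the same [10, 2000] tensor.
import numpy as np


def c_concat_ref(per_rank_inputs):
    """All-gather the rank slices and concatenate them on the last axis."""
    return np.concatenate(per_rank_inputs, axis=-1)

# slices = [np.random.rand(10, 1000).astype('float32') for _ in range(2)]
# out = c_concat_ref(slices)  # out.shape == (10, 2000) on every rank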
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from test_collective_base import TestCollectiveRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveSendRecv(TestCollectiveRunnerBase):
    def __init__(self):
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program):
        ring_id = self.global_ring_id
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(name="tindata",
                                  shape=[10, 1000],
                                  dtype='float64')
            if self.rank == 0:
                # Rank 0 pushes its tensor to rank 1.
                main_prog.global_block().append_op(type="send_v2",
                                                   inputs={'X': tindata},
                                                   attrs={
                                                       'ring_id': ring_id,
                                                       'peer': 1,
                                                       'use_calc_stream': True
                                                   })
            else:
                # Rank 1 receives into its local buffer.
                main_prog.global_block().append_op(type="recv_v2",
                                                   outputs={'Out': tindata},
                                                   attrs={
                                                       'peer': 0,
                                                       'ring_id': ring_id,
                                                       'dtype': tindata.dtype,
                                                       'out_shape': tindata.shape,
                                                       'use_calc_stream': True,
                                                   })
        return tindata


if __name__ == "__main__":
    runtime_main(TestCollectiveSendRecv, "sendrecv", 0)
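# send_v2 / recv_v2 form a point-to-point pair rather than a collective:
# rank 0 pushes its tensor to rank 1, which overwrites its local tindata
# with the received buffer. A trivial NumPy reference for the expected
# outcome (hypothetical helper, for illustration only):
import numpy as np


def sendrecv_ref(rank0_data, rank1_data):
    """After the exchange, rank 1 holds a copy of rank 0's tensor."""
    return rank0_data, rank0_data.copy()

# r0, r1 = sendrecv_ref(np.ones((10, 1000)), np.zeros((10, 1000)))
# np.array_equal(r0, r1)  # True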
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import core
from test_collective_base import TestCollectiveRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveReduce(TestCollectiveRunnerBase):
    def __init__(self):
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program):
        ring_id = 0
        rootid = 1
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(name="tindata",
                                  shape=[10, 1000],
                                  dtype='float32')
            toutdata = main_prog.current_block().create_var(
                name="outofreduce",
                dtype='float32',
                type=core.VarDesc.VarType.LOD_TENSOR,
                persistable=False,
                stop_gradient=False)
            # Sum-reduce all rank inputs to the root rank (rootid = 1).
            main_prog.global_block().append_op(type="c_reduce_sum",
                                               inputs={'X': tindata},
                                               attrs={
                                                   'ring_id': ring_id,
                                                   'use_calc_stream': True,
                                                   'root_id': rootid
                                               },
                                               outputs={'Out': toutdata})
            main_prog.global_block().append_op(type="c_sync_comm_stream",
                                               inputs={'X': toutdata},
                                               outputs={'Out': toutdata},
                                               attrs={'ring_id': ring_id})
        return toutdata


if __name__ == "__main__":
    runtime_main(TestCollectiveReduce, "reduce", 0)
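# A NumPy sketch of the reduce semantics above (illustrative only): every
# rank contributes its tensor, and only the root (rootid = 1 here) receives
# the elementwise sum; the other ranks' outputs are left as they were.
import numpy as np


def c_reduce_sum_ref(per_rank_inputs, root):
    """Deliver the elementwise sum to the root rank only."""
    outputs = [x.copy() for x in per_rank_inputs]
    outputs[root] = np.sum(per_rank_inputs, axis=0)
    return outputs

# outs = c_reduce_sum_ref([np.ones((10, 1000))] * 2, root=1)
# outs[1] is all 2.0 (the sum); outs[0] is unchanged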
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import core
from test_collective_base import TestCollectiveRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveAllGather(TestCollectiveRunnerBase):
    def __init__(self):
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program):
        ring_id = 0
        nranks = 2
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(name="tindata",
                                  shape=[10, 1000],
                                  dtype='float32')
            toutdata = main_prog.current_block().create_var(
                name="outofgather",
                dtype='float32',
                type=core.VarDesc.VarType.LOD_TENSOR,
                persistable=False,
                stop_gradient=False)
            # Gather every rank's tensor; all ranks get the stacked result.
            main_prog.global_block().append_op(type="c_allgather",
                                               inputs={'X': tindata},
                                               attrs={
                                                   'ring_id': ring_id,
                                                   'nranks': nranks
                                               },
                                               outputs={'Out': toutdata})
            main_prog.global_block().append_op(type="c_sync_comm_stream",
                                               inputs={'X': toutdata},
                                               outputs={'Out': toutdata},
                                               attrs={'ring_id': ring_id})
        return toutdata


if __name__ == "__main__":
    runtime_main(TestCollectiveAllGather, "allgather", 0)
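# Reference semantics for c_allgather (illustrative NumPy sketch): the
# [10, 1000] per-rank inputs are stacked along axis 0 in rank order, so
# every rank receives the same [20, 1000] tensor when nranks = 2.
import numpy as np


def c_allgather_ref(per_rank_inputs):
    """Concatenate the rank inputs along axis 0; identical on every rank."""
    return np.concatenate(per_rank_inputs, axis=0)

# out = c_allgather_ref([np.zeros((10, 1000)), np.ones((10, 1000))])
# out.shape == (20, 1000)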
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import core
from test_collective_base import TestCollectiveRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveSplit(TestCollectiveRunnerBase):
    def __init__(self):
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program):
        ring_id = 0
        nranks = 2
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(name="tindata",
                                  shape=[10, 1000],
                                  dtype='float32')
            toutdata = main_prog.current_block().create_var(
                name="outofsplit",
                dtype='float32',
                type=core.VarDesc.VarType.LOD_TENSOR,
                persistable=False,
                stop_gradient=False)
            # Each rank keeps only its own slice of the last axis.
            main_prog.global_block().append_op(type="c_split",
                                               inputs={'X': tindata},
                                               attrs={
                                                   'ring_id': ring_id,
                                                   'rank': self.rank,
                                                   'nranks': nranks
                                               },
                                               outputs={'Out': toutdata})
        return toutdata


if __name__ == "__main__":
    runtime_main(TestCollectiveSplit, "split", 0)
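# c_split is the inverse of the c_concat test above: each rank keeps only
# its own column block of the last axis. A NumPy sketch (illustrative):
# with nranks = 2, rank r's output is the r-th [10, 500] slice.
import numpy as np


def c_split_ref(full_input, rank, nranks):
    """Return the rank's block of the last axis."""
    return np.split(full_input, nranks, axis=-1)[rank]

# out = c_split_ref(np.random.rand(10, 1000), rank=0, nranks=2)
# out.shape == (10, 500)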
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import core
from test_collective_base import TestCollectiveRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveScatter(TestCollectiveRunnerBase):
    def __init__(self):
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program):
        ring_id = 0
        rootid = 1
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(name="tindata",
                                  shape=[10, 1000],
                                  dtype='float32')
            toutdata = main_prog.current_block().create_var(
                name="outofscatter",
                dtype='float32',
                type=core.VarDesc.VarType.LOD_TENSOR,
                persistable=False,
                stop_gradient=False)
            # The root rank's tensor is sharded across all ranks.
            main_prog.global_block().append_op(type="c_scatter",
                                               inputs={'X': tindata},
                                               attrs={
                                                   'ring_id': ring_id,
                                                   'root': rootid,
                                                   'nranks': 2
                                               },
                                               outputs={'Out': toutdata})
            main_prog.global_block().append_op(type="c_sync_comm_stream",
                                               inputs={'X': toutdata},
                                               outputs={'Out': toutdata},
                                               attrs={'ring_id': ring_id})
        return toutdata


if __name__ == "__main__":
    runtime_main(TestCollectiveScatter, "scatter", 0)
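# Reference for c_scatter (illustrative NumPy sketch): the root rank's
# [10, 1000] tensor is cut into nranks row blocks and block r is delivered
# to rank r, so each rank ends up with a [5, 1000] tensor when nranks = 2.
import numpy as np


def c_scatter_ref(root_input, nranks):
    """Split the root's tensor along axis 0, one shard per rank."""
    return np.split(root_input, nranks, axis=0)

# shards = c_scatter_ref(np.random.rand(10, 1000), nranks=2)
# shards[1].shape == (5, 1000)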
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import core
from test_collective_base import TestCollectiveRunnerBase, runtime_main

paddle.enable_static()


class TestCollectiveBroadcast(TestCollectiveRunnerBase):
    def __init__(self):
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program):
        ring_id = 0
        rootid = 1
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(name="tindata",
                                  shape=[10, 1000],
                                  dtype='float32')
            toutdata = main_prog.current_block().create_var(
                name="outofbroadcast",
                dtype='float32',
                type=core.VarDesc.VarType.LOD_TENSOR,
                persistable=False,
                stop_gradient=False)
            # Copy the root rank's tensor to every rank.
            main_prog.global_block().append_op(type="c_broadcast",
                                               inputs={'X': tindata},
                                               attrs={
                                                   'ring_id': ring_id,
                                                   'root': rootid
                                               },
                                               outputs={'Out': toutdata})
            main_prog.global_block().append_op(type="c_sync_comm_stream",
                                               inputs={'X': toutdata},
                                               outputs={'Out': toutdata},
                                               attrs={'ring_id': ring_id})
        return toutdata


if __name__ == "__main__":
    runtime_main(TestCollectiveBroadcast, "broadcast", 0)
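# Reference for c_broadcast (illustrative NumPy sketch): the root's tensor
# is copied to every rank unchanged, so all ranks hold identical [10, 1000]
# data afterwards.
import numpy as np


def c_broadcast_ref(per_rank_inputs, root):
    """Every rank receives a copy of the root rank's tensor."""
    return [per_rank_inputs[root].copy() for _ in per_rank_inputs]

# outs = c_broadcast_ref([np.zeros((10, 1000)), np.ones((10, 1000))], root=1)
# every entry of outs now equals rank 1's input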
main_prog.global_block().append_op(type="c_wait_comm", inputs={'X': toutdata}, outputs={'Out': toutdata}, attrs={'ring_id': ring_id}) # tout = tin + tout - tin = tout if True: main_prog.global_block().append_op( type="elementwise_add", inputs={ 'X': tindata, 'Y': toutdata, }, outputs={'Out': toutdata}, ) main_prog.global_block().append_op( type="elementwise_sub", inputs={ 'X': toutdata, 'Y': tindata, }, outputs={'Out': toutdata}, ) return toutdata if __name__ == "__main__": runtime_main(TestCollectiveAllreduce, "allreduce", 0)