def MetropolisCuda(sigma, T, J, B, iterations, ParaStyle, Alu, Device):
    # With PyCUDA autoinit, nothing else to do!
    sigmaCU = cuda.InOut(sigma)
    mod = SourceModule(KERNEL_CODE_CUDA)
    MetropolisCU = mod.get_function("MainLoopOne")
    start = pycuda.driver.Event()
    stop = pycuda.driver.Event()
    SizeX, SizeY = sigma.shape
    start.record()
    start.synchronize()
    MetropolisCU(sigmaCU, numpy.float32(T), numpy.float32(J),
                 numpy.float32(B), numpy.uint32(SizeX), numpy.uint32(SizeY),
                 numpy.uint32(iterations),
                 numpy.uint32(nprnd(2**31 - 1)),
                 numpy.uint32(nprnd(2**31 - 1)),
                 grid=(1, 1), block=(1, 1, 1))
    print "%s with %i %s done" % (Alu, 1, ParaStyle)
    stop.record()
    stop.synchronize()
    # elapsed = stop.time_since(start)*1e-3
    elapsed = start.time_till(stop) * 1e-3
    return (elapsed)
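# --- Hypothetical usage sketch for MetropolisCuda above (not part of the
# original source). It assumes these module-level imports, that nprnd is a
# shortcut for numpy.random.randint (as the seed expressions suggest), and
# an int32 lattice; adjust the dtype to whatever KERNEL_CODE_CUDA expects.
import numpy
import pycuda.autoinit  # implicit context creation, as noted above
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from numpy.random import randint as nprnd

sigma = numpy.where(numpy.random.randn(32, 32) > 0, 1, -1).astype(numpy.int32)
elapsed = MetropolisCuda(sigma, 2.0, 1.0, 0.0, 2**16, 'Blocks', 'GPU', 1)
print "Kernel time: %f s" % elapsed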
def MetropolisOpenCL(sigma, T, J, B, iterations, ParaStyle, Alu, Device):
    # Initialize the variables with the proper casts
    # Detect a GPU device in the list of peripherals
    # for platform in cl.get_platforms():
    #     for device in platform.get_devices():
    #         if cl.device_type.to_string(device.type)=='GPU':
    #             GPU=device
    #             print "GPU detected: ",device.name
    HasGPU = False
    Id = 1
    # Primary device selection based on Device Id
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            # deviceType=cl.device_type.to_string(device.type)
            deviceType = "xPU"
            if Id == Device and not HasGPU:
                GPU = device
                print "CPU/GPU selected: ", device.name
                HasGPU = True
            Id = Id + 1
    # Create the context and the queue for its execution
    # ctx = cl.create_some_context()
    ctx = cl.Context([GPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # Fetch the available flags for the buffers
    mf = cl.mem_flags
    # Careful with the cast! It is an int8, i.e. a char in OpenCL!
    sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=sigma)
    MetropolisCL = cl.Program(ctx, KERNEL_CODE_OPENCL).build(
        options="-cl-mad-enable -cl-fast-relaxed-math")
    SizeX, SizeY = sigma.shape
    if ParaStyle == 'Blocks':
        # Call the OpenCL kernel
        # (1,) is the global work size (a single work item)
        # None lets the implementation choose the local work size
        CLLaunch = MetropolisCL.MainLoopOne(queue, (1,), None, sigmaCL,
                                            numpy.float32(T),
                                            numpy.float32(J),
                                            numpy.float32(B),
                                            numpy.uint32(SizeX),
                                            numpy.uint32(SizeY),
                                            numpy.uint32(iterations),
                                            numpy.uint32(nprnd(2**31 - 1)),
                                            numpy.uint32(nprnd(2**31 - 1)))
        print "%s with %i %s done" % (Alu, 1, ParaStyle)
    else:
        # In OpenCL it is necessary to set a global id identical to the local id
        CLLaunch = MetropolisCL.MainLoopOne(queue, (1,), (1,), sigmaCL,
                                            numpy.float32(T),
                                            numpy.float32(J),
                                            numpy.float32(B),
                                            numpy.uint32(SizeX),
                                            numpy.uint32(SizeY),
                                            numpy.uint32(iterations),
                                            numpy.uint32(nprnd(2**31 - 1)),
                                            numpy.uint32(nprnd(2**31 - 1)))
        print "%s with %i %s done" % (Alu, 1, ParaStyle)
    CLLaunch.wait()
    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    elapsed = 1e-9 * (CLLaunch.profile.end - CLLaunch.profile.start)
    sigmaCL.release()
    return (elapsed)
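# --- A minimal sketch (not from the original source) that prints devices in
# the same order the selector above counts them, so the right value can be
# passed as Device:
import pyopencl as cl

Id = 1
for platform in cl.get_platforms():
    for device in platform.get_devices():
        print "Device #%i: %s / %s" % (Id, platform.name, device.name)
        Id += 1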
def MetropolisOpenCL(circle, iterations, steps, jobs, ParaStyle, Alu, Device,
                     RNG, ValueType):
    # Initialize the variables with the proper casts
    if Device == 0:
        print "Enter XPU selector based on ALU type: first selected"
        HasXPU = False
        # Default device selection based on ALU type
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                deviceType = cl.device_type.to_string(device.type)
                if deviceType == "GPU" and Alu == "GPU" and not HasXPU:
                    XPU = device
                    print "GPU selected: ", device.name
                    HasXPU = True
                if deviceType == "CPU" and Alu == "CPU" and not HasXPU:
                    XPU = device
                    print "CPU selected: ", device.name
                    HasXPU = True
    else:
        print "Enter XPU selector based on device number & ALU type"
        Id = 1
        HasXPU = False
        # Primary device selection based on Device Id
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                deviceType = cl.device_type.to_string(device.type)
                if Id == Device and Alu == deviceType and not HasXPU:
                    XPU = device
                    print "CPU/GPU selected: ", device.name.lstrip()
                    HasXPU = True
                Id = Id + 1
        if not HasXPU:
            print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
                (Device, Alu, Id - 1)
            # Keep the same arity as the normal return below
            return (0, 0, 0, 0, 0, 0)
    # Create the context and the queue for its execution
    ctx = cl.Context([XPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # Fetch the available flags for the buffers
    mf = cl.mem_flags
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=circle)
    MetropolisCL = cl.Program(ctx, KERNEL_CODE_OPENCL).build(
        options="-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s" %
        (Marsaglia[RNG], Computing[ValueType]))
    MyPi = numpy.zeros(steps)
    MyDuration = numpy.zeros(steps)
    if iterations % jobs == 0:
        iterationsCL = numpy.uint64(iterations / jobs)
        iterationsNew = numpy.uint64(iterationsCL * jobs)
    else:
        iterationsCL = numpy.uint64(iterations / jobs + 1)
        iterationsNew = numpy.uint64(iterations)
    for i in range(steps):
        if ParaStyle == 'Blocks':
            # Call the OpenCL kernel
            # (jobs,) is the global work size; None lets the implementation
            # choose the local work size
            # circleCL is the lattice translated into CL format
            # The two uint32 arguments seed the Marsaglia RNG
            CLLaunch = MetropolisCL.MainLoopGlobal(
                queue, (jobs,), None, circleCL,
                numpy.uint64(iterationsCL),
                numpy.uint32(nprnd(2**30 / jobs)),
                numpy.uint32(nprnd(2**30 / jobs)))
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                (Alu, jobs, 1, ParaStyle)
        elif ParaStyle == 'Hybrid':
            threads = BestThreadsNumber(jobs)
            # In OpenCL it is necessary to set a global id identical to the local id
            CLLaunch = MetropolisCL.MainLoopHybrid(
                queue, (jobs,), (threads,), circleCL,
                numpy.uint64(iterationsCL),
                numpy.uint32(nprnd(2**30 / jobs)),
                numpy.uint32(nprnd(2**30 / jobs)))
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                (Alu, jobs / threads, threads, ParaStyle)
        else:
            # In OpenCL it is necessary to set a global id identical to the local id
            CLLaunch = MetropolisCL.MainLoopLocal(
                queue, (jobs,), (jobs,), circleCL,
                numpy.uint64(iterationsCL),
                numpy.uint32(nprnd(2**30 / jobs)),
                numpy.uint32(nprnd(2**30 / jobs)))
            print "%s with %i %s done" % (Alu, jobs, ParaStyle)
        CLLaunch.wait()
        cl.enqueue_copy(queue, circle, circleCL).wait()
        elapsed = 1e-9 * (CLLaunch.profile.end - CLLaunch.profile.start)
        print circle, numpy.mean(circle), numpy.median(circle), numpy.std(circle)
        MyDuration[i] = elapsed
        AllPi = 4. / numpy.float32(iterationsCL) * circle.astype(numpy.float32)
        MyPi[i] = numpy.median(AllPi)
        print MyPi[i], numpy.std(AllPi), MyDuration[i]
    circleCL.release()
    # Iterations is the module-level total requested on the command line
    print jobs, numpy.mean(MyDuration), numpy.median(MyDuration), \
        numpy.std(MyDuration), numpy.mean(Iterations / MyDuration), \
        numpy.median(Iterations / MyDuration), numpy.std(Iterations / MyDuration)
    return (numpy.mean(MyDuration), numpy.median(MyDuration),
            numpy.std(MyDuration), numpy.mean(Iterations / MyDuration),
            numpy.median(Iterations / MyDuration),
            numpy.std(Iterations / MyDuration))
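# --- BestThreadsNumber is called above but not shown in this listing. A
# plausible sketch (an assumption, not the original helper): pick the largest
# power of two that divides jobs, capped at a typical work-group size, so the
# global work size stays a multiple of the local one.
def BestThreadsNumber(jobs):
    threads = 1
    while jobs % (threads * 2) == 0 and threads * 2 <= 512:
        threads *= 2
    return threads

print BestThreadsNumber(64)   # 64
print BestThreadsNumber(96)   # 32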
# Create the context and the queue for its execution
try:
    ctx = cl.Context([XPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
except:
    print "Crash during context creation"

MyRoutines = cl.Program(ctx, BlobOpenCL).build()

mf = cl.mem_flags
clData = cl.Buffer(ctx, mf.READ_WRITE, MyData.nbytes)

print 'All at the same place', MyData
MyRoutines.SplutterPoints(queue, (Number, 1), None, clData,
                          np.float32(SizeOfBox - LengthOfSegment),
                          np.uint32(nprnd(2**32)), np.uint32(nprnd(2**32)))
cl.enqueue_copy(queue, MyData, clData)
print 'All distributed', MyData

MyRoutines.ExtendSegment(queue, (Number, 1), None, clData,
                         np.float32(LengthOfSegment),
                         np.uint32(nprnd(2**32)), np.uint32(nprnd(2**32)))
cl.enqueue_copy(queue, MyData, clData)
print 'All with their endpoint', MyData

MySize = np.zeros(len(MyData), dtype=np.float32)
clSize = cl.Buffer(ctx, mf.READ_WRITE, MySize.nbytes)
MyRoutines.EstimateLength(queue, (Number, 1), None, clData, clSize)
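# --- Completing the round trip (a sketch; that EstimateLength writes one
# float32 estimate per work item into clSize is an assumption): copy the
# results back to the host and reduce them there.
cl.enqueue_copy(queue, MySize, clSize).wait()
print 'Estimated lengths', MySize
print 'Mean estimated length', np.mean(MySize)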
def MetropolisAllCuda(sigmaDict, TList, J, B, iterations, jobs, ParaStyle,
                      Alu, Device):
    # sigmaDict & TList are NOT a single array & float:
    # sigmaDict : dict holding one lattice array per temperature
    # TList : list of temperatures
    # With PyCUDA autoinit, nothing else to do!
    mod = SourceModule(KERNEL_CODE_CUDA)
    MetropolisBlocksCuda = mod.get_function("MainLoopGlobal")
    MetropolisThreadsCuda = mod.get_function("MainLoopLocal")
    MetropolisHybridCuda = mod.get_function("MainLoopHybrid")
    # Concatenate all sigma into a single array
    sigma = numpy.copy(sigmaDict[TList[0]])
    for T in TList[1:]:
        sigma = numpy.concatenate((sigma, sigmaDict[T]), axis=1)
    sigmaCU = cuda.InOut(sigma)
    TCU = cuda.InOut(TList)
    SizeX, SizeY = sigmaDict[TList[0]].shape
    start = pycuda.driver.Event()
    stop = pycuda.driver.Event()
    start.record()
    start.synchronize()
    if ParaStyle == 'Blocks':
        # Call the CUDA kernel: one block per job, one thread per block
        MetropolisBlocksCuda(sigmaCU, TCU, numpy.float32(J), numpy.float32(B),
                             numpy.uint32(SizeX), numpy.uint32(SizeY),
                             numpy.uint32(iterations),
                             numpy.uint32(nprnd(2**31 - 1)),
                             numpy.uint32(nprnd(2**31 - 1)),
                             grid=(jobs, 1), block=(1, 1, 1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu, jobs, 1, ParaStyle)
    elif ParaStyle == 'Threads':
        # One block, one thread per job
        MetropolisThreadsCuda(sigmaCU, TCU, numpy.float32(J), numpy.float32(B),
                              numpy.uint32(SizeX), numpy.uint32(SizeY),
                              numpy.uint32(iterations),
                              numpy.uint32(nprnd(2**31 - 1)),
                              numpy.uint32(nprnd(2**31 - 1)),
                              grid=(1, 1), block=(jobs, 1, 1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu, 1, jobs, ParaStyle)
    else:
        threads = BestThreadsNumber(jobs)
        MetropolisHybridCuda(sigmaCU, TCU, numpy.float32(J), numpy.float32(B),
                             numpy.uint32(SizeX), numpy.uint32(SizeY),
                             numpy.uint32(iterations),
                             numpy.uint32(nprnd(2**31 - 1)),
                             numpy.uint32(nprnd(2**31 - 1)),
                             grid=(jobs / threads, 1), block=(threads, 1, 1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu, jobs / threads, threads, ParaStyle)
    stop.record()
    stop.synchronize()
    elapsed = start.time_till(stop) * 1e-3
    # Split the concatenated lattice back into one array per temperature
    results = numpy.split(sigma, len(TList), axis=1)
    for T in TList:
        sigmaDict[T] = numpy.copy(results[numpy.nonzero(TList == T)[0][0]])
    return (elapsed)
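# --- Toy illustration (not from the original source) of the concatenate/split
# round trip used above, with two temperatures and 2x2 lattices:
import numpy

TList = numpy.array([1.5, 2.5], dtype=numpy.float32)
sigmaDict = {}
for T in TList:
    sigmaDict[T] = numpy.ones((2, 2), dtype=numpy.int32)
sigma = numpy.concatenate((sigmaDict[TList[0]], sigmaDict[TList[1]]), axis=1)
print sigma.shape    # (2, 4): lattices laid side by side
results = numpy.split(sigma, len(TList), axis=1)
print results[0].shape    # (2, 2): one lattice per temperature again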
def MetropolisCuda(circle, iterations, steps, jobs, ParaStyle, RNG, ValueType):
    # With PyCUDA autoinit, nothing else to do!
    circleCU = cuda.InOut(circle)
    try:
        mod = SourceModule(KERNEL_CODE_CUDA,
                           options=['--compiler-options',
                                    '-Wall -DTRNG=%i -DTYPE=%s' %
                                    (Marsaglia[RNG], Computing[ValueType])])
    except:
        print "Compilation seems to have failed"
        raise
    MetropolisBlocksCU = mod.get_function("MainLoopBlocks")
    MetropolisJobsCU = mod.get_function("MainLoopThreads")
    MetropolisHybridCU = mod.get_function("MainLoopHybrid")
    start = pycuda.driver.Event()
    stop = pycuda.driver.Event()
    MyPi = numpy.zeros(steps)
    MyDuration = numpy.zeros(steps)
    if iterations % jobs == 0:
        iterationsCL = numpy.uint64(iterations / jobs)
        iterationsNew = iterationsCL * jobs
    else:
        iterationsCL = numpy.uint64(iterations / jobs + 1)
        iterationsNew = iterations
    for i in range(steps):
        start.record()
        start.synchronize()
        if ParaStyle == 'Blocks':
            MetropolisBlocksCU(circleCU, numpy.uint64(iterationsCL),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               grid=(jobs, 1), block=(1, 1, 1))
            # Alu is expected as a module-level global in this listing
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                (Alu, jobs, 1, ParaStyle)
        elif ParaStyle == 'Hybrid':
            threads = BestThreadsNumber(jobs)
            MetropolisHybridCU(circleCU, numpy.uint64(iterationsCL),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               grid=(jobs, 1), block=(threads, 1, 1))
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                (Alu, jobs / threads, threads, ParaStyle)
        else:
            MetropolisJobsCU(circleCU, numpy.uint64(iterationsCL),
                             numpy.uint32(nprnd(2**30 / jobs)),
                             numpy.uint32(nprnd(2**30 / jobs)),
                             grid=(1, 1), block=(jobs, 1, 1))
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                (Alu, jobs, 1, ParaStyle)
        stop.record()
        stop.synchronize()
        elapsed = start.time_till(stop) * 1e-3
        MyDuration[i] = elapsed
        AllPi = 4. / numpy.float32(iterationsCL) * circle.astype(numpy.float32)
        MyPi[i] = numpy.median(AllPi)
        print MyPi[i], numpy.std(AllPi), MyDuration[i]
    # Iterations is the module-level total requested on the command line
    print jobs, numpy.mean(MyDuration), numpy.median(MyDuration), \
        numpy.std(MyDuration), numpy.mean(Iterations / MyDuration), \
        numpy.median(Iterations / MyDuration), numpy.std(Iterations / MyDuration)
    return (numpy.mean(MyDuration), numpy.median(MyDuration),
            numpy.std(MyDuration), numpy.mean(Iterations / MyDuration),
            numpy.median(Iterations / MyDuration),
            numpy.std(Iterations / MyDuration))
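# --- Pure-numpy reference (a sketch, not the original kernel) of the Monte
# Carlo Pi estimator the CUDA code above drives: count random points falling
# inside the unit quarter disc.
import numpy

def PiReference(iterations):
    x = numpy.random.random(iterations)
    y = numpy.random.random(iterations)
    return 4. * numpy.sum(x * x + y * y <= 1.) / iterations

print PiReference(2**20)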
LAPIMAGE = False

sigmaIn = numpy.where(numpy.random.randn(Size, Size) > 0, 1,
                      -1).astype(numpy.int32)

ImageOutput(sigmaIn, "Ising2D_Serial_%i_Initial" % (Size))

E = []
M = []
for T in Trange:
    # Using copy is mandatory: [:] does not work with numpy!
    sigma = numpy.copy(sigmaIn)
    # duration=Metropolis(sigma,J,B,T,Iterations)
    SeedW, SeedZ = numpy.int32(nprnd(2**31 - 1)), numpy.int32(nprnd(2**31 - 1))
    start = time.time()
    array_module_np.array_metropolis_np(sigma, J, B, T, Iterations,
                                        SeedW, SeedZ)
    duration = time.time() - start
    E = numpy.append(E, Energy(sigma, J))
    M = numpy.append(M, Magnetization(sigma, B))
    ImageOutput(sigma, "Ising2D_Serial_%i_%1.1f_Final" % (Size, T))
    print "CPU Time : %f" % (duration)
    print "Total Energy at Temperature %f : %f" % (T, E[-1])
    print "Total Magnetization at Temperature %f : %f" % (T, M[-1])

if Curves:
    DisplayCurves(Trange, E, M, J, B)
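# --- Energy and Magnetization are called above but defined elsewhere. These
# numpy sketches are assumptions, using the usual 2D Ising conventions with
# periodic boundary conditions:
def Energy(sigma, J):
    # Sum nearest-neighbour couplings, counting each bond once via two shifts
    return -J * numpy.sum(sigma * (numpy.roll(sigma, 1, axis=0) +
                                   numpy.roll(sigma, 1, axis=1)))

def Magnetization(sigma, B):
    # Plain total magnetization; B is accepted to match the call site, and
    # any field contribution is left out of this sketch
    return numpy.sum(sigma)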
def MetropolisAllOpenCL(sigmaDict, TList, J, B, iterations, jobs, ParaStyle,
                        Alu, Device):
    # sigmaDict & TList are NOT a single array & float:
    # sigmaDict : dict holding one lattice array per temperature
    # TList : list of temperatures
    # Initialize the variables with the proper casts
    # Detect a GPU device in the list of peripherals
    HasGPU = False
    Id = 1
    # Primary device selection based on Device Id
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            # deviceType=cl.device_type.to_string(device.type)
            deviceType = "xPU"
            if Id == Device and not HasGPU:
                GPU = device
                print "CPU/GPU selected: ", device.name
                HasGPU = True
            Id = Id + 1
    # Create the context and the queue for its execution
    # ctx = cl.create_some_context()
    ctx = cl.Context([GPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # Fetch the available flags for the buffers
    mf = cl.mem_flags
    # Concatenate all sigma into a single array
    sigma = numpy.copy(sigmaDict[TList[0]])
    for T in TList[1:]:
        sigma = numpy.concatenate((sigma, sigmaDict[T]), axis=1)
    sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=sigma)
    TCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=TList)
    MetropolisCL = cl.Program(ctx, KERNEL_CODE_OPENCL).build(
        options="-cl-mad-enable -cl-fast-relaxed-math")
    SizeX, SizeY = sigmaDict[TList[0]].shape
    if ParaStyle == 'Blocks':
        # Call the OpenCL kernel
        # (jobs,) is the global work size; None lets the implementation
        # choose the local work size
        CLLaunch = MetropolisCL.MainLoopGlobal(queue, (jobs,), None,
                                               sigmaCL, TCL,
                                               numpy.float32(J),
                                               numpy.float32(B),
                                               numpy.uint32(SizeX),
                                               numpy.uint32(SizeY),
                                               numpy.uint32(iterations),
                                               numpy.uint32(nprnd(2**31 - 1)),
                                               numpy.uint32(nprnd(2**31 - 1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu, jobs, 1, ParaStyle)
    elif ParaStyle == 'Threads':
        # It is necessary to set a local id equal to the global id.
        # Jobs are to be considered as the global number of jobs to do,
        # distributed over the compute units, for example:
        # G_ID=10 & L_ID=10 : 10 threads on 1 compute unit
        CLLaunch = MetropolisCL.MainLoopLocal(queue, (jobs,), (jobs,),
                                              sigmaCL, TCL,
                                              numpy.float32(J),
                                              numpy.float32(B),
                                              numpy.uint32(SizeX),
                                              numpy.uint32(SizeY),
                                              numpy.uint32(iterations),
                                              numpy.uint32(nprnd(2**31 - 1)),
                                              numpy.uint32(nprnd(2**31 - 1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu, 1, jobs, ParaStyle)
    else:
        threads = BestThreadsNumber(jobs)
        # In OpenCL it is necessary to set a global id identical to the local id
        CLLaunch = MetropolisCL.MainLoopHybrid(queue, (jobs,), (threads,),
                                               sigmaCL, TCL,
                                               numpy.float32(J),
                                               numpy.float32(B),
                                               numpy.uint32(SizeX),
                                               numpy.uint32(SizeY),
                                               numpy.uint32(iterations),
                                               numpy.uint32(nprnd(2**31 - 1)),
                                               numpy.uint32(nprnd(2**31 - 1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu, jobs / threads, threads, ParaStyle)
    CLLaunch.wait()
    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    elapsed = 1e-9 * (CLLaunch.profile.end - CLLaunch.profile.start)
    sigmaCL.release()
    # Split the concatenated lattice back into one array per temperature
    results = numpy.split(sigma, len(TList), axis=1)
    for T in TList:
        sigmaDict[T] = numpy.copy(results[numpy.nonzero(TList == T)[0][0]])
    return (elapsed)
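# --- Hypothetical driver (not from the original source) for
# MetropolisAllOpenCL above: one lattice per temperature, all simulated in a
# single kernel launch, one job per temperature.
TList = numpy.arange(0.5, 3.5, 0.5).astype(numpy.float32)
sigmaDict = {}
for T in TList:
    sigmaDict[T] = numpy.where(numpy.random.randn(64, 64) > 0, 1,
                               -1).astype(numpy.int32)
elapsed = MetropolisAllOpenCL(sigmaDict, TList, 1.0, 0.0, 2**16,
                              len(TList), 'Blocks', 'GPU', 1)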
def MetropolisOpenCL(circle, iterations, steps, jobs, ParaStyle, Alu, Device,
                     Memory):
    # Initialize the variables with the proper casts
    MaxMemoryXPU = 0
    MinMemoryXPU = 0
    if Device == 0:
        print "Enter XPU selector based on ALU type: first selected"
        HasXPU = False
        # Default device selection based on ALU type
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                # deviceType=cl.device_type.to_string(device.type)
                deviceMemory = device.max_mem_alloc_size
                if deviceMemory > MaxMemoryXPU:
                    MaxMemoryXPU = deviceMemory
                if deviceMemory < MinMemoryXPU or MinMemoryXPU == 0:
                    MinMemoryXPU = deviceMemory
                if not HasXPU:
                    XPU = device
                    print "XPU selected with Allocable Memory %i: %s" % \
                        (deviceMemory, device.name)
                    HasXPU = True
                    MemoryXPU = deviceMemory
    else:
        print "Enter XPU selector based on device number & ALU type"
        Id = 1
        HasXPU = False
        # Primary device selection based on Device Id
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                # deviceType=cl.device_type.to_string(device.type)
                deviceMemory = device.max_mem_alloc_size
                if deviceMemory > MaxMemoryXPU:
                    MaxMemoryXPU = deviceMemory
                if deviceMemory < MinMemoryXPU or MinMemoryXPU == 0:
                    MinMemoryXPU = deviceMemory
                if Id == Device and not HasXPU:
                    XPU = device
                    print "CPU/GPU selected with Allocable Memory %i: %s" % \
                        (deviceMemory, device.name)
                    HasXPU = True
                    MemoryXPU = deviceMemory
                Id = Id + 1
        if not HasXPU:
            print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
                (Device, Alu, Id - 1)
            return (0, 0, 0)
    print "Allocable Memory is %i, between %i and %i " % \
        (MemoryXPU, MinMemoryXPU, MaxMemoryXPU)
    # Create the context and the queue for its execution
    ctx = cl.Context([XPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # Fetch the available flags for the buffers
    mf = cl.mem_flags
    MetropolisCL = cl.Program(ctx, KERNEL_CODE_OPENCL).build(
        options="-cl-mad-enable -cl-fast-relaxed-math")
    MyDuration = numpy.zeros(steps)
    if iterations % jobs == 0:
        iterationsCL = numpy.uint64(iterations / jobs)
    else:
        iterationsCL = numpy.uint64(iterations / jobs + 1)
    iterationsNew = numpy.uint64(iterationsCL * jobs)
    MySplutter = numpy.zeros(steps)
    MaxWorks = 2**(int)(numpy.log2(MinMemoryXPU / 4))
    print MaxWorks, 2**(int)(numpy.log2(MemoryXPU))
    # Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
    # Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
    Splutter = numpy.zeros(Memory).astype(numpy.uint32)
    for i in range(steps):
        # Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
        # Splutter=numpy.zeros(1024).astype(numpy.uint32)
        # Splutter=numpy.zeros(jobs).astype(numpy.uint32)
        Splutter[:] = 0
        print Splutter, len(Splutter)
        h2d_time = time.time()
        SplutterCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=Splutter)
        print('From Host to Device time %f' % (time.time() - h2d_time))
        start_time = time.time()
        if ParaStyle == 'Blocks':
            # Call the OpenCL kernel
            # (jobs,) is the global work size; None lets the implementation
            # choose the local work size
            # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
            #                                      SplutterCL,
            #                                      numpy.uint32(len(Splutter)),
            #                                      numpy.uint64(iterationsCL),
            #                                      numpy.uint32(nprnd(2**30/jobs)),
            #                                      numpy.uint32(nprnd(2**30/jobs)))
            CLLaunch = MetropolisCL.SplutterGlobal(
                queue, (jobs,), None, SplutterCL,
                numpy.uint32(len(Splutter)),
                numpy.uint64(iterationsCL),
                numpy.uint32(nprnd(2**30 / jobs)),
                numpy.uint32(nprnd(2**30 / jobs)))
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                (Alu, jobs, 1, ParaStyle)
        elif ParaStyle == 'Hybrid':
            # threads=BestThreadsNumber(jobs)
            threads = BestThreadsNumber(256)
            print "Threads used:", threads
            # In OpenCL it is necessary to set a global id identical to the local id
            CLLaunch = MetropolisCL.SplutterHybrid(
                queue, (jobs,), (threads,), SplutterCL,
                numpy.uint32(len(Splutter)),
                numpy.uint64(iterationsCL),
                numpy.uint32(nprnd(2**30 / jobs)),
                numpy.uint32(nprnd(2**30 / jobs)))
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                (Alu, jobs / threads, threads, ParaStyle)
        else:
            # In OpenCL it is necessary to set a global id identical to the local id
            CLLaunch = MetropolisCL.SplutterLocal(
                queue, (jobs,), (jobs,), SplutterCL,
                numpy.uint32(len(Splutter)),
                numpy.uint64(iterationsCL),
                numpy.uint32(nprnd(2**30 / jobs)),
                numpy.uint32(nprnd(2**30 / jobs)))
            print "%s with %i %s done" % (Alu, jobs, ParaStyle)
        CLLaunch.wait()
        d2h_time = time.time()
        cl.enqueue_copy(queue, Splutter, SplutterCL).wait()
        print('From Device to Host %f' % (time.time() - d2h_time))
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
        elapsed = time.time() - start_time
        print('Elapsed compute time %f' % elapsed)
        MyDuration[i] = elapsed
        # print Splutter,sum(Splutter)
        # MySplutter[i]=numpy.median(Splutter)
        # print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)
        SplutterCL.release()
    print jobs, numpy.mean(MyDuration), numpy.median(MyDuration), \
        numpy.std(MyDuration)
    return (numpy.mean(MyDuration), numpy.median(MyDuration),
            numpy.std(MyDuration))
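# --- Sketch (not from the original source) to inspect max_mem_alloc_size per
# device, the quantity the selector above uses to bound the Splutter buffer:
import pyopencl as cl

for platform in cl.get_platforms():
    for device in platform.get_devices():
        print "%s: %i bytes allocable" % (device.name,
                                          device.max_mem_alloc_size)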
def MetropolisCuda(circle, iterations, steps, jobs, ParaStyle, Density, Memory):
    # With PyCUDA autoinit, nothing else to do!
    circleCU = cuda.InOut(circle)
    mod = SourceModule(KERNEL_CODE_CUDA)
    if Density == 'Dense':
        MetropolisBlocksCU = mod.get_function("SplutterGlobalDense")
        MetropolisThreadsCU = mod.get_function("SplutterLocalDense")
        MetropolisHybridCU = mod.get_function("SplutterHybridDense")
    elif Density == 'Sparse':
        MetropolisBlocksCU = mod.get_function("SplutterGlobalSparse")
        MetropolisThreadsCU = mod.get_function("SplutterLocalSparse")
        MetropolisHybridCU = mod.get_function("SplutterHybridSparse")
    else:
        # Beware: only the Blocks kernel is defined in this case; the
        # Threads and Hybrid branches below would fail
        MetropolisBlocksCU = mod.get_function("SplutterGlobal")
    start = pycuda.driver.Event()
    stop = pycuda.driver.Event()
    MySplutter = numpy.zeros(steps)
    MyDuration = numpy.zeros(steps)
    if iterations % jobs == 0:
        iterationsCL = numpy.uint64(iterations / jobs)
    else:
        iterationsCL = numpy.uint64(iterations / jobs + 1)
    iterationsNew = iterationsCL * jobs
    Splutter = numpy.zeros(jobs * 16).astype(numpy.uint32)
    for i in range(steps):
        start_time = time.time()
        Splutter[:] = 0
        print Splutter, len(Splutter)
        SplutterCU = cuda.InOut(Splutter)
        start.record()
        start.synchronize()
        if ParaStyle == 'Blocks':
            MetropolisBlocksCU(SplutterCU, numpy.uint32(len(Splutter)),
                               numpy.uint64(iterationsCL),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               grid=(jobs, 1), block=(1, 1, 1))
            # Alu is expected as a module-level global in this listing
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                (Alu, jobs, 1, ParaStyle)
        elif ParaStyle == 'Hybrid':
            threads = BestThreadsNumber(jobs)
            MetropolisHybridCU(SplutterCU, numpy.uint32(len(Splutter)),
                               numpy.uint64(iterationsCL),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               grid=(jobs, 1), block=(threads, 1, 1))
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                (Alu, jobs / threads, threads, ParaStyle)
        else:
            MetropolisThreadsCU(SplutterCU, numpy.uint32(len(Splutter)),
                                numpy.uint64(iterationsCL),
                                numpy.uint32(nprnd(2**30 / jobs)),
                                numpy.uint32(nprnd(2**30 / jobs)),
                                grid=(1, 1), block=(jobs, 1, 1))
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                (Alu, 1, jobs, ParaStyle)
        stop.record()
        stop.synchronize()
        # elapsed = start.time_till(stop)*1e-3
        elapsed = time.time() - start_time
        print Splutter, sum(Splutter)
        MySplutter[i] = numpy.median(Splutter)
        print numpy.mean(Splutter), MySplutter[i], numpy.std(Splutter)
        MyDuration[i] = elapsed
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
        # MyPi[i]=numpy.median(AllPi)
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]
    print jobs, numpy.mean(MyDuration), numpy.median(MyDuration), \
        numpy.std(MyDuration)
    return (numpy.mean(MyDuration), numpy.median(MyDuration),
            numpy.std(MyDuration))
def Metropolis(sigma, J, B, T, iterations, Device, Divider):
    kernel_params = {'block_size': sigma.shape[0] / Divider}
    # Detect the device in the list of peripherals
    Id = 1
    HasXPU = False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id == Device:
                XPU = device
                print "CPU/GPU selected: ", device.name.lstrip()
                HasXPU = True
            Id += 1
    if not HasXPU:
        print "No XPU #%i found in all of %i devices, sorry..." % \
            (Device, Id - 1)
        sys.exit()
    ctx = cl.Context([XPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # Fetch the available flags for the buffers
    mf = cl.mem_flags
    sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=sigma)
    # Program based on Kernel2
    MetropolisCL = cl.Program(ctx,
                              KERNEL_CODE.substitute(kernel_params)).build()
    divide = Divider * Divider
    step = STEP / divide
    i = 0
    duration = 0.
    while (step * i < iterations / divide):
        # Call the OpenCL kernel
        # (Divider,Divider) is the global work size
        # sigmaCL is the lattice translated into CL format
        # step is the number of iterations
        start_time = time.time()
        CLLaunch = MetropolisCL.MainLoop(queue, (Divider, Divider), None,
                                         sigmaCL, numpy.float32(J),
                                         numpy.float32(B), numpy.float32(T),
                                         numpy.uint32(sigma.shape[0]),
                                         numpy.uint32(step),
                                         numpy.uint32(nprnd(2**32)),
                                         numpy.uint32(nprnd(2**32)))
        CLLaunch.wait()
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
        elapsed = time.time() - start_time
        print "Iteration %i with T=%f and %i iterations in %f: " % \
            (i, T, step, elapsed)
        if LAPIMAGE:
            cl.enqueue_copy(queue, sigma, sigmaCL).wait()
            CheckLattice(sigma)
            ImageOutput(sigma,
                        "Ising2D_GPU_Local_%i_%1.1f_%.3i_Lap" % (SIZE, T, i))
        i = i + 1
        duration = duration + elapsed
    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    CheckLattice(sigma)
    sigmaCL.release()
    return (duration)
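# --- KERNEL_CODE above is used through .substitute(), so it is presumably a
# string.Template; a minimal sketch of that pattern (the placeholder name
# block_size matches kernel_params, the kernel body is illustrative only):
from string import Template

KERNEL_CODE = Template("""
#define BLOCK_SIZE ${block_size}
/* ... kernel source using BLOCK_SIZE ... */
""")
print KERNEL_CODE.substitute({'block_size': 16})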
def Metropolis(sigma, J, B, T, iterations, Device):
    # Initialize the variables with the proper casts
    # Detect the device in the list of peripherals
    Id = 1
    HasXPU = False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id == Device:
                XPU = device
                print "CPU/GPU selected: ", device.name.lstrip()
                HasXPU = True
            Id += 1
    if not HasXPU:
        print "No XPU #%i found in all of %i devices, sorry..." % \
            (Device, Id - 1)
        sys.exit()
    # Create the context and the queue for its execution
    ctx = cl.Context([XPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    # Fetch the available flags for the buffers
    mf = cl.mem_flags
    sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=sigma)
    MetropolisCL = cl.Program(ctx, KERNEL_CODE).build(
        options="-cl-mad-enable -cl-fast-relaxed-math")
    i = 0
    step = STEP
    duration = 0.
    while (step * i < iterations):
        # Call the OpenCL kernel
        # (1,) is the global work size (a single work item)
        # sigmaCL is the lattice translated into CL format
        # step is the number of iterations
        start_time = time.time()
        CLLaunch = MetropolisCL.MainLoop(queue, (1,), None, sigmaCL,
                                         numpy.float32(J), numpy.float32(B),
                                         numpy.float32(T),
                                         numpy.uint32(sigma.shape[0]),
                                         numpy.uint32(step),
                                         numpy.uint32(nprnd(2**32)),
                                         numpy.uint32(nprnd(2**32)))
        CLLaunch.wait()
        # Event profiling does not seem to work under AMD/ATI
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
        elapsed = time.time() - start_time
        print "Iteration %i with T=%f and %i iterations in %f: " % \
            (i, T, step, elapsed)
        if LAPIMAGE:
            cl.enqueue_copy(queue, sigma, sigmaCL).wait()
            ImageOutput(sigma,
                        "Ising2D_GPU_Global_%i_%1.1f_%.3i_Lap" % (SIZE, T, i))
        i = i + 1
        duration = duration + elapsed
    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    sigmaCL.release()
    return (duration)
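# --- Module-level settings assumed by the Metropolis variants above (the
# values here are placeholders, not from the original source):
STEP = 2**16      # iterations per kernel launch between laps
LAPIMAGE = False  # dump an image of the lattice after each lap
SIZE = 256        # lattice edge length used in image file names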
clPotential = cl.Buffer(ctx, mf.READ_WRITE, MyPotential.nbytes)
clKinetic = cl.Buffer(ctx, mf.READ_WRITE, MyKinetic.nbytes)
clCoM = cl.Buffer(ctx, mf.READ_WRITE, MyCoM.nbytes)

# Write/HostPointer approach for buffering
# clDataX = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataX)
# clDataV = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataV)
# clPotential = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyPotential)  # noqa: E501
# clKinetic = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyKinetic)
# clCoM = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyCoM)

print("All particles superimposed.")

# Set particles to RNG points
if InitialRandom:
    seed_w = np.uint32(nprnd(2**32))
    seed_z = np.uint32(nprnd(2**32))
else:
    seed_w = np.uint32(19710211)
    seed_z = np.uint32(20081010)

if Shape == "Ball":
    MyRoutines.InBallSplutterPoints(queue, (Number, 1), None, clDataX,
                                    SizeOfShape, seed_w, seed_z)
else:
    MyRoutines.InBoxSplutterPoints(queue, (Number, 1), None, clDataX,
                                   SizeOfShape, seed_w, seed_z)

print("All particles distributed")

CLLaunch = MyRoutines.CenterOfMass(queue, (1, 1), None, clDataX, clCoM,