def VecSet(show):
    """Plot the VecSet rate against vector size for a CPU run and a GPU run.

    Timings come from ``ut.get_time`` on the logs under ``../data/vec-ops``
    (CPU) and ``../data/figures-2-7-8-9`` (GPU) and are converted with
    ``ut.calc_rate``.  The figure is always written to
    ``../plots/VecSet.png``.

    Args:
        show: when true, also display the figure with ``plt.show()``.
    """
    gpu_sizes = [
        1000, 10000, 100000, 1000000, 2000000, 4000000, 6000000, 8000000,
        10000000, 20000000, 40000000, 60000000, 80000000, 100000000,
        1000000000
    ]
    cpu_sizes = [1000, 10000, 100000, 1000000, 10000000, 100000000,
                 1000000000]

    # 7 CPUs
    cpu = []
    for size in cpu_sizes:
        elapsed = ut.get_time(
            "../data/vec-ops/vec_ops.n2_g0_c21_p7." + str(size) + ".654910",
            "VecSet", 2)
        cpu.append(ut.calc_rate(size, elapsed))

    # 1 GPU with 1 CPU
    gpu = []
    for size in gpu_sizes:
        elapsed = ut.get_time(
            "../data/figures-2-7-8-9/vec_ops.n1_g1_c2_a1." + str(size)
            + ".668627", "VecSet", 3)
        gpu.append(ut.calc_rate(size, elapsed))

    # plot
    num = 2
    cm = plt.get_cmap('inferno')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # set_color_cycle is deprecated since matplotlib 1.5 and absent from
    # current releases; set_prop_cycle is its replacement.
    ax.set_prop_cycle(color=[cm(float(i) / num) for i in range(num)])
    ax.plot(cpu_sizes, cpu, marker="o", markersize=4, markeredgewidth=2,
            label="7 CPU cores")
    ax.plot(gpu_sizes, gpu, marker="o", markersize=4, markeredgewidth=2,
            label="1 GPU")
    plt.title("VecSet performance", fontsize=12)
    plt.xlabel("Vector size", fontsize=12)
    plt.ylabel("8 Mbytes/second", fontsize=12)
    plt.legend(loc="upper left", fontsize=12, frameon=False)
    plt.tight_layout()
    plt.xscale('log')
    # show the y ticks with thousands separators
    ax.set_yticklabels(
        ['{:,}'.format(int(tick)) for tick in ax.get_yticks().tolist()])
    plt.savefig("../plots/VecSet.png")
    if show:
        plt.show()
def cpu_vs_gpu_copy(show):
    """Plot VecCopy and copy-to-GPU rates for the 6-GPU and 42-core runs.

    Three curves are drawn on a log-log twin axis: CPU VecCopy, GPU VecCopy
    (both doubled, one read plus one write) and VecCUDACopyTo from the
    pinned-memory runs.  A short horizontal tick at x = 2e9 marks each
    hard-coded peak rate.  The figure is always written to
    ``../plots/CPU_vs_GPU_copy_siampp.png``.

    Args:
        show: when true, also display the figure with ``plt.show()``.
    """
    gpu_sizes = [
        1000, 10000, 100000, 1000000, 2000000, 4000000, 6000000, 8000000,
        10000000, 20000000, 40000000, 60000000, 80000000, 100000000,
        1000000000
    ]
    cpu_sizes = [1000, 10000, 100000, 1000000, 10000000, 100000000,
                 1000000000]
    copy_scale = 2  # two memory accesses per element
    small_size_limit = 100000

    gpu_VecCopy = []
    gpu_ToGpu = []
    for size in gpu_sizes:
        elapsed = ut.get_time(
            "../data/figures-2-7-8-9/vec_ops.n6_g1_c2_a1." + str(size)
            + ".668627", "VecCopy", 1)
        gpu_VecCopy.append(copy_scale * ut.calc_rate(size, elapsed))

        # pinned memory: small vectors come from a different run.
        # The original compared the whole `gpu_sizes` list with the limit,
        # which never selected the small-size run.
        if size <= small_size_limit:
            run_num = ".732319"
        else:
            run_num = ".715071"
        elapsed = ut.get_time(
            "../data/pinned/vec_ops.n6_g1_c7_a1." + str(size) + run_num,
            "VecCUDACopyTo", 1)
        gpu_ToGpu.append(ut.calc_rate(size, elapsed))

    cpu_VecCopy = []
    for size in cpu_sizes:
        elapsed = ut.get_time(
            "../data/cpu-flush-cache/vec_ops.n2_g0_c21_p42." + str(size),
            "VecCopy", 1)
        cpu_VecCopy.append(copy_scale * ut.calc_rate(size, elapsed))

    # calculate peak rates in 8 Mbytes/second
    cpu_rate = 135 * 1e9
    gpu_rate = 900 * 1e9
    cpu_to_gpu_rate = 50 * 1e9
    cpu_peak = (2 * cpu_rate) / (8 * 1e6)
    gpu_peak = (6 * gpu_rate) / (8 * 1e6)
    cpu_to_gpu_peak = (6 * cpu_to_gpu_rate) / (8 * 1e6)

    # plot
    fig, left = plt.subplots()
    right = left.twinx()
    cm = plt.get_cmap('inferno')
    gpu_color = cm(0.0)
    cpu_color = cm(0.5)
    transfer_color = cm(0.75)

    right.plot(cpu_sizes, cpu_VecCopy, color=cpu_color,
               label="42 CPU cores VecCopy")
    right.plot(gpu_sizes, gpu_VecCopy, color=gpu_color,
               label="6 GPUs VecCopy")
    right.plot(gpu_sizes, gpu_ToGpu, color=transfer_color,
               label="Copy to GPU")

    # peak markers sit on the right edge of the plot, hence clip_on=False
    peak_tick = dict(linestyle="none", markersize=15, markeredgewidth=2,
                     marker="_", clip_on=False)
    plt.plot(2000000000, gpu_peak, color=gpu_color, **peak_tick)
    plt.plot(2000000000, cpu_peak, color=cpu_color, **peak_tick)
    plt.plot(2000000000, cpu_to_gpu_peak, color=transfer_color, **peak_tick)
    plt.xlim([500, 2000000000])

    left.set_title("GPU vs CPU copy performance", fontsize=12)
    left.set_xlabel("Vector size", fontsize=12)
    left.set_ylabel("8 MBytes/second", fontsize=12)
    right.legend(loc="upper left", fontsize=12, ncol=1, frameon=False)
    left.set_xscale('log')
    left.set_yscale('log')
    right.set_yscale('log')
    # both axes share one range, so only the left scale is shown
    right.get_yaxis().set_visible(False)
    left.set_ylim(top=10000000)
    right.set_ylim(top=10000000)
    right.set_ylim(bottom=20)
    left.set_ylim(bottom=20)
    plt.tight_layout()
    plt.savefig("../plots/CPU_vs_GPU_copy_siampp.png")
    if show:
        plt.show()
def VecCUDACopyTo_all(show):
    """Plot VecCUDACopyTo rates for non-pinned, pinned and WaitForGPU runs.

    For each of the three run groups the function loads one series for the
    "1 MPI rank and 1 GPU" log and one series per entry of ``cpus`` for the
    6-GPU logs.  Non-pinned series are dashed, pinned series are solid (and
    carry the legend labels) and pinned WaitForGPU series are dotted.  The
    rank count, size and time of every 6-GPU reading are printed.  The
    figure is not saved (the ``savefig`` call is disabled).

    Args:
        show: when true, display the figure with ``plt.show()``.
    """
    cpus = [1]  # , 2, 4]
    sizes = [1000, 10000, 100000]  # , 1000000, 10000000, 100000000, 1000000000]
    scale = 1  # no memory movement

    def _rates(prefix, run_id, echo=None):
        """Return the scaled rate for every size of one run."""
        rates = []
        for size in sizes:
            elapsed = ut.get_time(prefix + str(size) + run_id,
                                  "VecCUDACopyTo", 1)
            if echo is not None:
                print(echo)
                print(size)
                print(elapsed)
            rates.append(scale * ut.calc_rate(size, elapsed))
        return rates

    def _group(title, single_prefix, single_run, multi_prefix, multi_run):
        """Load the 1-GPU series followed by one 6-GPU series per rank count."""
        print(title)
        series = [_rates(single_prefix, single_run)]  # 1 GPU with 1 CPU
        for cpu in cpus:
            series.append(
                _rates(multi_prefix + str(cpu) + ".", multi_run, echo=cpu))
        return series

    data = _group(
        "Non-pinned",
        "../data/vec-ops/vec_ops.n1_g1_c42_a1.", ".654911",
        "../data/vec-ops/vec_ops.n6_g1_c7_a", ".654914")
    data_pinned = _group(
        "Pinned",
        "../data/pinned/vec_ops.n1_g1_c42_a1.", ".713339",
        "../data/pinned/vec_ops.n6_g1_c7_a", ".715071")
    data_pinned_waitforgpu = _group(
        "Pinned WaitForGPU()",
        "../data/pinned/vec_ops.n1_g1_c2_a1.", ".732318",
        "../data/pinned/vec_ops.n6_g1_c7_a", ".732319")

    # plot
    labels = ["1 MPI rank and 1 GPU", "1 MPI rank per GPU"]  # , "2 MPI ranks per GPU", "4 MPI ranks per GPU"]
    num = len(labels)
    cm = plt.get_cmap('inferno')
    palette = [cm(float(i) / num) for i in range(num)]
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # set_color_cycle is deprecated since matplotlib 1.5 and absent from
    # current releases; set_prop_cycle is its replacement.
    ax.set_prop_cycle(color=palette)
    for series in data[:num]:
        ax.plot(sizes, series, marker="o", linestyle="dashed")
    # restart the cycle so each pinned curve reuses its non-pinned colour
    ax.set_prop_cycle(color=palette)
    for series, label in zip(data_pinned[:num], labels):
        ax.plot(sizes, series, marker="o", label=label)
    # the cycle has exactly `num` colours, so it wraps around by itself here
    for series in data_pinned_waitforgpu[:num]:
        ax.plot(sizes, series, marker="o", linestyle="dotted")

    plt.title("CPU to GPU transfer performance", fontsize=12)
    plt.xlabel("Vector size", fontsize=12)
    plt.ylabel("8 Mbytes/second", fontsize=12)
    ax.legend(loc="upper left", fontsize=12, frameon=False)
    plt.tight_layout()
    plt.xscale('log')
    # show the y ticks with thousands separators
    ax.set_yticklabels(
        ['{:,}'.format(int(tick)) for tick in ax.get_yticks().tolist()])
    # plt.savefig("../plots/VecCUDACopyTo_all.png")
    if show:
        plt.show()
def VecCUDACopyTo_comparison(comp, show):
    """Plot the pinned / non-pinned VecCUDACopyTo rate ratio per vector size.

    One curve is drawn for the "1 MPI rank and 1 GPU" logs and one for each
    rank count in ``cpus`` of the 6-GPU logs, together with a dashed
    reference line at ratio 1.  The figure is not saved (the ``savefig``
    call is disabled).

    Args:
        comp: comparison mode; only ``"_ratio"`` is implemented.
        show: when true, display the figure with ``plt.show()``.

    Raises:
        ValueError: if ``comp`` is not ``"_ratio"``.  Previously this
            produced empty series and failed later, inside ``plot``.
    """
    if comp != "_ratio":
        raise ValueError(
            "unsupported comparison {!r}; expected '_ratio'".format(comp))

    cpus = [1, 2, 4]
    sizes = [1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000]
    scale = 1  # no memory movement

    def _rate(path, size):
        """Return the scaled VecCUDACopyTo rate recorded in one log file."""
        elapsed = ut.get_time(path, "VecCUDACopyTo", 1)
        return scale * ut.calc_rate(size, elapsed)

    data = []

    # 1 GPU with 1 CPU
    ratios = []
    for size in sizes:
        nonpinned = _rate(
            "../data/vec-ops/vec_ops.n1_g1_c42_a1." + str(size) + ".654911",
            size)
        pinned = _rate(
            "../data/pinned/vec_ops.n1_g1_c42_a1." + str(size) + ".713339",
            size)
        ratios.append(pinned / nonpinned)
    data.append(ratios)

    for cpu in cpus:
        ratios = []
        for size in sizes:
            nonpinned = _rate(
                "../data/vec-ops/vec_ops.n6_g1_c7_a" + str(cpu) + "."
                + str(size) + ".654914", size)
            pinned = _rate(
                "../data/pinned/vec_ops.n6_g1_c7_a" + str(cpu) + "."
                + str(size) + ".715071", size)
            ratios.append(pinned / nonpinned)

            # spot-check output for two sizes of the 1-rank-per-GPU run
            if cpu == 1 and size in (10000, 100000000):
                print(size)
                print(nonpinned)
                # NOTE(review): this second value is read from the 1-GPU
                # pinned log (n1_g1_c42_a1), not the 6-GPU one used for the
                # ratio -- kept as in the original, confirm it is intended.
                print(_rate(
                    "../data/pinned/vec_ops.n1_g1_c42_a1." + str(size)
                    + ".713339", size))
        data.append(ratios)

    # plot
    labels = ["1 MPI rank and 1 GPU", "1 MPI rank per GPU",
              "2 MPI ranks per GPU", "4 MPI ranks per GPU"]
    num = len(labels)
    cm = plt.get_cmap('inferno')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # set_color_cycle is deprecated since matplotlib 1.5 and absent from
    # current releases; set_prop_cycle is its replacement.  The +2 offset
    # skips the first two colours of the map.
    ax.set_prop_cycle(
        color=[cm(float(i + 2) / (num + 2)) for i in range(num)])
    for series, label in zip(data, labels):
        ax.plot(sizes, series, marker="o", label=label)

    # dashed reference line at ratio 1, wider than the visible x range
    reference_x = [100, 1000, 10000, 100000, 1000000, 10000000, 100000000,
                   1000000000, 10000000000]
    ax.plot(reference_x, [1] * len(reference_x), color="black",
            linestyle="dashed")

    plt.xlim([500, 2000000000])
    plt.title("CPU to GPU transfer performance", fontsize=12)
    plt.xlabel("Vector size", fontsize=12)
    plt.ylabel("Pinned memory/non-pinned memory", fontsize=12)
    ax.legend(loc="upper left", fontsize=12, frameon=False)
    plt.tight_layout()
    plt.xscale('log')
    # plt.savefig("../plots/VecCUDACopyTo_ratio.png")
    if show:
        plt.show()
def cpu_vs_gpu(operation, count, show):
    """Plot flop rates and copy rates of a vector operation, CPU vs GPU.

    Flop rates (left axis, ``ut.get_floprate``) are drawn solid; VecCopy and
    copy-to-GPU rates (right axis, ``ut.calc_rate``) are drawn dashed.  A
    short horizontal tick at x = 2e9 marks each hard-coded peak rate.  The
    figure is not saved (the ``savefig`` call is disabled).

    NOTE(review): a second ``cpu_vs_gpu(operation, count, clear, show)`` is
    defined later in this file; being later, it replaces this definition
    when the module is loaded.

    Args:
        operation: ``"VecDot"`` or ``"VecAXPY"``.
        count: forwarded unchanged to ``ut.get_floprate``.
        show: when true, display the figure with ``plt.show()``.

    Raises:
        ValueError: if ``operation`` is not one of the two supported names.
    """
    # right-axis stretch factor and GPU flop-rate log (prefix, run id)
    mem_scales = {"VecDot": 1, "VecAXPY": 1.5}
    gpu_flop_logs = {
        "VecDot": ("../data/waitforgpu/vec_ops.n6_g1_c2_a1.", ".718559"),
        "VecAXPY": ("../data/figures-2-7-8-9/vec_ops.n6_g1_c2_a1.",
                    ".668627"),
    }
    if operation not in mem_scales:
        raise ValueError(
            "unsupported operation {!r}; expected 'VecDot' or 'VecAXPY'"
            .format(operation))
    mem_scale = mem_scales[operation]
    flop_prefix, flop_run = gpu_flop_logs[operation]

    gpu_sizes = [
        1000, 10000, 100000, 1000000, 2000000, 4000000, 6000000, 8000000,
        10000000, 20000000, 40000000, 60000000, 80000000, 100000000,
        1000000000
    ]
    cpu_sizes = [1000, 10000, 100000, 1000000, 10000000, 100000000,
                 1000000000]
    copy_scale = 2  # two memory accesses per element
    small_size_limit = 100000

    gpu = []
    gpu_VecCopy = []
    gpu_ToGpu = []
    for size in gpu_sizes:
        gpu.append(float(ut.get_floprate(
            flop_prefix + str(size) + flop_run, operation, False, count)))

        elapsed = ut.get_time(
            "../data/figures-2-7-8-9/vec_ops.n6_g1_c2_a1." + str(size)
            + ".668627", "VecCopy", 1)
        gpu_VecCopy.append(copy_scale * ut.calc_rate(size, elapsed))

        # pinned memory: small vectors come from a different run.
        # The original compared the whole `gpu_sizes` list with the limit,
        # which never selected the small-size run.
        if size <= small_size_limit:
            run_num = ".732319"
        else:
            run_num = ".715071"
        elapsed = ut.get_time(
            "../data/pinned/vec_ops.n6_g1_c7_a1." + str(size) + run_num,
            "VecCUDACopyTo", 1)
        gpu_ToGpu.append(ut.calc_rate(size, elapsed))

    cpu = []
    cpu_VecCopy = []
    for size in cpu_sizes:
        cpu_log = "../data/cpu-flush-cache/vec_ops.n2_g0_c21_p42." + str(size)
        cpu.append(float(ut.get_floprate(cpu_log, operation, True, count)))
        elapsed = ut.get_time(cpu_log, "VecCopy", 1)
        cpu_VecCopy.append(copy_scale * ut.calc_rate(size, elapsed))

    # calculate peak rates in 8 Mbytes/second
    cpu_rate = 135 * 1e9
    gpu_rate = 900 * 1e9
    cpu_to_gpu_rate = 50 * 1e9
    cpu_peak = (2 * cpu_rate) / (8 * 1e6)
    gpu_peak = (6 * gpu_rate) / (8 * 1e6)
    cpu_to_gpu_peak = (6 * cpu_to_gpu_rate) / (8 * 1e6)

    # plot
    fig, left = plt.subplots()
    right = left.twinx()
    cm = plt.get_cmap('inferno')
    gpu_color = cm(0.0)
    cpu_color = cm(0.5)
    transfer_color = cm(0.75)

    left.plot(cpu_sizes, cpu, color=cpu_color,
              label="42 CPU cores " + operation)
    right.plot(cpu_sizes, cpu_VecCopy, color=cpu_color, linestyle="dashed",
               label="42 CPU cores VecCopy")
    right.plot(gpu_sizes, gpu_ToGpu, color=transfer_color,
               linestyle="dashed", label="6 GPUs copy to GPU")
    left.plot(gpu_sizes, gpu, color=gpu_color, label="6 GPUs " + operation)
    right.plot(gpu_sizes, gpu_VecCopy, color=gpu_color, linestyle="dashed",
               label="6 GPUs VecCopy")

    # peak markers sit on the right edge of the plot, hence clip_on=False
    peak_tick = dict(linestyle="none", markersize=15, markeredgewidth=2,
                     marker="_", clip_on=False)
    plt.plot(2000000000, gpu_peak, color=gpu_color, label="GPU copy peak",
             **peak_tick)
    plt.plot(2000000000, cpu_to_gpu_peak, color=transfer_color,
             label="CPU to GPU peak", **peak_tick)
    plt.plot(2000000000, cpu_peak, color=cpu_color, label="CPU copy peak",
             **peak_tick)
    plt.xlim([500, 2000000000])

    left.set_title("GPU vs CPU " + operation + " performance", fontsize=12)
    left.set_xlabel("Vector size", fontsize=12)
    left.set_ylabel("MFlops/second", fontsize=12)
    right.set_ylabel("8 MBytes/second", fontsize=12)
    left.legend(loc="upper left", fontsize=12, ncol=1, frameon=False)
    plt.legend(loc="lower right", fontsize=12, ncol=1, frameon=False)
    left.set_xscale('log')
    left.set_yscale('log')
    right.set_yscale('log')
    left.set_ylim(top=10000000)
    right.set_ylim(top=10000000 * mem_scale)
    right.set_ylim(bottom=20)
    left.set_ylim(bottom=20)
    plt.tight_layout()
    # plt.savefig("../plots/" + operation + "_CPU_vs_GPU.png")
    if show:
        plt.show()
def synthetic_latency(operation, count, show):
    """Plot measured flop rates next to a latency-corrected GPU estimate.

    Three curves are drawn on log-log axes: the CPU flop rate, the GPU flop
    rate, and a synthetic GPU rate computed as
    ``2 * size * 1e-6 / (time - 24e-6)``, i.e. the measured time with 24
    microseconds subtracted.  The figure is written to
    ``../plots/<operation>_synthetic_latency.png`` and the current figure is
    cleared afterwards.

    The original also built 16/28/50 microsecond variants and loaded
    VecCopy / VecCUDACopyTo rates; none of them were plotted, so they are no
    longer computed.

    Args:
        operation: ``"VecAXPY"`` or ``"VecDot"``.
        count: forwarded unchanged to ``ut.get_floprate`` and ``ut.get_time``.
        show: when true, display the figure with ``plt.show()`` before it is
            cleared.

    Raises:
        ValueError: if ``operation`` is not one of the two supported names.
    """
    if operation not in ("VecAXPY", "VecDot"):
        raise ValueError(
            "unsupported operation {!r}; expected 'VecAXPY' or 'VecDot'"
            .format(operation))

    sizes = [1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000]
    latency = 24e-6  # seconds removed from every measured GPU time

    gpu = []
    gpu_24 = []
    cpu = []
    for size in sizes:
        gpu_log = ("../data/vec-ops/vec_ops.n6_g1_c7_a1." + str(size)
                   + ".654914")
        cpu_log = ("../data/vec-ops/vec_ops.n2_g0_c21_p42." + str(size)
                   + ".654910")

        # flop rate from file
        gpu.append(float(ut.get_floprate(gpu_log, operation, False, count)))
        # synthetic flop rate from the measured time
        elapsed = ut.get_time(gpu_log, operation, count)
        gpu_24.append((2 * size * 1e-6) / (elapsed - latency))

        cpu.append(float(ut.get_floprate(cpu_log, operation, True, count)))

    # plot
    plt.plot(sizes, cpu, color="grey", alpha=0.5, marker=".", markersize=6,
             markeredgewidth=2, label="42 CPUs " + operation)
    plt.plot(sizes, gpu, color="black", marker=".", markersize=6,
             markeredgewidth=2, label="6 GPUs " + operation)
    # raw string: "\c" is not a valid escape in an ordinary string literal
    plt.plot(sizes, gpu_24, color="black", marker=".", markersize=6,
             markeredgewidth=2, linestyle="dotted",
             label=r"$24\cdot10^{-6}$ latency")
    plt.title(operation + " performance without calculated latency",
              fontsize=12)
    plt.xlabel("Vector size", fontsize=12)
    plt.ylabel("MFlops/second", fontsize=12)
    plt.legend(loc="lower right", fontsize=12, frameon=False)
    plt.xscale('log')
    plt.yscale('log')
    plt.ylim(top=1000000)
    plt.tight_layout()
    plt.savefig("../plots/" + operation + "_synthetic_latency.png")
    if show:
        plt.show()
    # plt.plot draws on the implicit current figure; clear it for the next call
    plt.gcf().clear()
hs.append(numpy.mean([c.h() for c in cells(mesh)])) print('Compute the solution for n={}...'.format(n)) g = interpolate(gexp, FunctionSpace(mesh, 'Regge', degree)) h = min([c.inradius() for c in cells(mesh)]) / 2.0 (_, solh) = exponential_map(g, 0, q0, p0, h, t2s(Tmax), verbose=True) print('Evaluate the solution and compute the error...') t = numpy.linspace(0, Tmax, 200 * M + 1) s = t2s(t) qe = sol(t) (qh, ph) = solh(s) (Hh, Lh) = integrals(qh, ph) d = qe - qh ee.append(numpy.max(numpy.sqrt(numpy.array([q.dot(q) for q in d])))) eH.append(numpy.max(numpy.abs(Hh - H))) eL.append(numpy.max(numpy.abs(Lh - L))) # compute error rates and output prefix = 'conv-deg{}-'.format(degree) print('') print('===DEGREE {}==='.format(degree)) rate_plot(hs, ee, name=prefix + 'e') print(calc_rate(hs, ee)) rate_plot(hs, eH, name=prefix + 'H') print(calc_rate(hs, eH)) rate_plot(hs, eL, name=prefix + 'L') print(calc_rate(hs, eL)) print('')
def jed_cpu_vs_gpu(operation, count, show):
    """Plot rates against execution time for a vector operation, CPU vs GPU.

    Every curve uses the measured time as x value instead of the vector
    size.  Flop rates are drawn solid; VecCopy and copy-to-GPU rates are
    dashed and divided by the operation's ``mem_scale`` so they share the
    left axis.  The figure is written to
    ``../plots/jed_<operation>_CPU_vs_GPU.png``.

    Args:
        operation: ``"VecDot"`` or ``"VecAXPY"``.
        count: forwarded unchanged to ``ut.get_time`` and ``ut.get_floprate``.
        show: when true, also display the figure with ``plt.show()``.

    Raises:
        ValueError: if ``operation`` is not one of the two supported names.
    """
    # bandwidth divisor and GPU operation log (prefix, run id)
    mem_scales = {"VecDot": 1, "VecAXPY": 1.5}
    gpu_op_logs = {
        "VecDot": ("../data/waitforgpu/vec_ops.n6_g1_c2_a1.", ".718559"),
        "VecAXPY": ("../data/figures-2-7-8-9/vec_ops.n6_g1_c2_a1.",
                    ".668627"),
    }
    if operation not in mem_scales:
        raise ValueError(
            "unsupported operation {!r}; expected 'VecDot' or 'VecAXPY'"
            .format(operation))
    mem_scale = mem_scales[operation]
    op_prefix, op_run = gpu_op_logs[operation]

    gpu_sizes = [
        1000, 10000, 100000, 1000000, 2000000, 4000000, 6000000, 8000000,
        10000000, 20000000, 40000000, 60000000, 80000000, 100000000,
        1000000000
    ]
    cpu_sizes = [1000, 10000, 100000, 1000000, 10000000, 100000000,
                 1000000000]
    copy_scale = 2 / mem_scale      # VecCopy: two memory accesses
    transfer_scale = 1 / mem_scale  # copy to GPU: one memory access
    small_size_limit = 100000

    gpu = []
    gpu_time = []
    gpu_VecCopy = []
    gpu_VecCopy_time = []
    gpu_ToGpu = []
    gpu_ToGpu_time = []
    for size in gpu_sizes:
        # operation time and floprate
        op_log = op_prefix + str(size) + op_run
        gpu_time.append(ut.get_time(op_log, operation, count))
        gpu.append(float(ut.get_floprate(op_log, operation, False, count)))

        # GPU copy time and bandwidth
        elapsed = ut.get_time(
            "../data/figures-2-7-8-9/vec_ops.n6_g1_c2_a1." + str(size)
            + ".668627", "VecCopy", 1)
        gpu_VecCopy_time.append(elapsed)
        gpu_VecCopy.append(copy_scale * ut.calc_rate(size, elapsed))

        # copy-to-GPU time and bandwidth, pinned memory: small vectors come
        # from a different run.  The original compared the whole `gpu_sizes`
        # list with the limit, which never selected the small-size run.
        if size <= small_size_limit:
            run_num = ".732319"
        else:
            run_num = ".715071"
        elapsed = ut.get_time(
            "../data/pinned/vec_ops.n6_g1_c7_a1." + str(size) + run_num,
            "VecCUDACopyTo", 1)
        gpu_ToGpu_time.append(elapsed)
        gpu_ToGpu.append(transfer_scale * ut.calc_rate(size, elapsed))

    cpu = []
    cpu_time = []
    cpu_VecCopy = []
    cpu_VecCopy_time = []
    for size in cpu_sizes:
        cpu_log = "../data/cpu-flush-cache/vec_ops.n2_g0_c21_p42." + str(size)
        # CPU operation time and floprate
        cpu_time.append(ut.get_time(cpu_log, operation, count))
        cpu.append(float(ut.get_floprate(cpu_log, operation, True, count)))
        # CPU copy time and bandwidth
        elapsed = ut.get_time(cpu_log, "VecCopy", 1)
        cpu_VecCopy_time.append(elapsed)
        cpu_VecCopy.append(copy_scale * ut.calc_rate(size, elapsed))

    # plot
    fig, left = plt.subplots()
    right = left.twinx()
    cm = plt.get_cmap('inferno')
    gpu_color = cm(0.0)
    cpu_color = cm(0.5)
    transfer_color = cm(0.75)

    left.plot(cpu_time, cpu, color=cpu_color,
              label="42 CPU cores " + operation)
    left.plot(cpu_VecCopy_time, cpu_VecCopy, color=cpu_color,
              linestyle="dashed", label="42 CPU cores VecCopy")
    left.plot(gpu_ToGpu_time, gpu_ToGpu, color=transfer_color,
              linestyle="dashed", label="6 GPUs copy to GPU")
    left.plot(gpu_time, gpu, color=gpu_color, label="6 GPUs " + operation)
    left.plot(gpu_VecCopy_time, gpu_VecCopy, color=gpu_color,
              linestyle="dashed", label="6 GPUs VecCopy")

    left.set_title("GPU vs CPU " + operation + " performance", fontsize=12)
    left.set_xlabel("Execution time (seconds)", fontsize=12)
    left.set_ylabel("MFlops/second", fontsize=12)
    right.set_ylabel("8 MBytes/second", fontsize=12)
    left.legend(loc="lower right", fontsize=12, ncol=1, frameon=False)
    left.set_xscale('log')
    left.set_yscale('log')
    right.set_yscale('log')
    left.set_ylim([7, 1000000])
    # the right axis carries no curves; its range is the left one times
    # mem_scale so both scales line up
    right.set_ylim([7, 1000000 * mem_scale])
    plt.xlim([1e-6, .2])
    # lay out before saving so the PNG matches what plt.show() displays
    plt.tight_layout()
    plt.savefig("../plots/jed_" + operation + "_CPU_vs_GPU.png")
    if show:
        plt.show()
def cpu_vs_gpu(operation, count, clear, show):
    """Plot CPU flop rates of one operation for several cache-flush runs.

    Flop rates are loaded for six 42-core runs and VecCopy rates for three
    of them; four flop-rate curves are drawn (cleared, half cleared,
    "December" and uncleared cache).  The remaining series are loaded but
    only used by the disabled plot lines.  The first "December" flop rate is
    printed.  The figure is not saved (the ``savefig`` call is disabled).

    NOTE(review): an earlier ``cpu_vs_gpu(operation, count, show)`` in this
    file has the same name; this later definition replaces it when the
    module is loaded.

    Args:
        operation: ``"VecDot"`` or ``"VecAXPY"``.
        count: forwarded unchanged to ``ut.get_floprate``.
        clear: not used by this function.
        show: when true, display the figure with ``plt.show()``.

    Raises:
        ValueError: if ``operation`` is not one of the two supported names
            (previously an ``UnboundLocalError`` on the first reading).
    """
    mem_scales = {"VecDot": 1, "VecAXPY": 1.5}
    if operation not in mem_scales:
        raise ValueError(
            "unsupported operation {!r}; expected 'VecDot' or 'VecAXPY'"
            .format(operation))
    mem_scale = mem_scales[operation]

    cpu_sizes = [1000, 10000, 100000, 1000000, 10000000, 100000000,
                 1000000000]

    def _log(directory, size, run_id):
        """Build the path of the 42-core log for one run and size."""
        return ("../data/" + directory + "/vec_ops.n2_g0_c21_p42."
                + str(size) + run_id)

    # (directory, run id) of every run, in the order the logs are read
    flop_runs = [
        ("cpu-flush-cache", ".767597"),
        ("vec-ops", ".654910"),
        ("vec-ops-december", ".795805"),
        ("cpu-no-flush-cache", ".767590"),
        ("cpu-flush-cache-vecset", ".792547"),
        ("permute-operations", ".792549"),
    ]
    copy_runs = [
        ("cpu-flush-cache", ".767597"),
        ("vec-ops", ".654910"),
        ("cpu-no-flush-cache", ".767590"),
    ]
    flop_series = [[] for _ in flop_runs]
    copy_series = [[] for _ in copy_runs]
    copy_scale = 2 / mem_scale  # VecCopy: two memory accesses

    for size in cpu_sizes:
        for series, (directory, run_id) in zip(flop_series, flop_runs):
            series.append(float(ut.get_floprate(
                _log(directory, size, run_id), operation, True, count)))
        for series, (directory, run_id) in zip(copy_series, copy_runs):
            elapsed = ut.get_time(_log(directory, size, run_id),
                                  "VecCopy", 1)
            series.append(copy_scale * ut.calc_rate(size, elapsed))

    (cpu_flush, cpu_half_flush, cpu_half_flush_december, cpu_no_flush,
     cpu_flush_vecset, cpu_permute) = flop_series
    (cpu_VecCopy_flush, cpu_VecCopy_half_flush,
     cpu_VecCopy_no_flush) = copy_series

    print(cpu_half_flush_december[0])

    # plot
    fig, left = plt.subplots()
    right = left.twinx()
    cm = plt.get_cmap('inferno')
    flop_color = cm(0.5)

    left.plot(cpu_sizes, cpu_flush, color=flop_color,
              label=operation + " cleared cache")
    left.plot(cpu_sizes, cpu_half_flush, color=flop_color,
              linestyle="dashed", label=operation + " half cleared cache")
    left.plot(cpu_sizes, cpu_half_flush_december, color="black",
              label=operation + " December")
    left.plot(cpu_sizes, cpu_no_flush, color=flop_color, linestyle="dotted",
              label=operation + " uncleared cache")
    # Disabled comparison curves:
    # left.plot(cpu_sizes, cpu_permute, color="black", label=operation + " rearrange operations")
    # left.plot(cpu_sizes, cpu_VecCopy_flush, color=cm(0.0), label="VecCopy cleared cache")
    # left.plot(cpu_sizes, cpu_VecCopy_half_flush, color=cm(0.0), linestyle="dashed", label="VecCopy half cleared cache")
    # left.plot(cpu_sizes, cpu_VecCopy_no_flush, color=cm(0.0), linestyle="dotted", label="VecCopy uncleared cache")

    plt.xlim([500, 2000000000])
    left.set_title("CPU " + operation + " cache performance", fontsize=12)
    left.set_xlabel("Vector size", fontsize=12)
    left.set_ylabel("MFlops/second", fontsize=12)
    right.set_ylabel("8 MBytes/second", fontsize=12)
    left.legend(loc="lower right", fontsize=12, ncol=1, frameon=False)
    # NOTE(review): every curve above is drawn on `left`; if plt.legend
    # targets the twin axis it has no labelled artists to list -- confirm
    # whether this call is still wanted.
    plt.legend(loc="upper left", fontsize=12, ncol=1, frameon=False)
    left.set_xscale('log')
    left.set_yscale('log')
    right.set_yscale('log')
    top_ = 1000000
    left.set_ylim(top=top_)
    right.set_ylim(top=top_ * mem_scale)
    right.set_ylim(bottom=20)
    left.set_ylim(bottom=20)
    plt.tight_layout()
    # plt.savefig("../plots/" + operation + "_CPU_cleared_cache.png")
    if show:
        plt.show()
def VecCopy(show):
    """Plot VecCopy and copy-to-GPU rates for a 7-core CPU run and one GPU.

    Three curves are drawn against vector size: CPU VecCopy, GPU VecCopy and
    the copy-to-GPU rate, which uses the faster of the pinned and non-pinned
    timings for each size.  Black ticks and text labels at the right edge
    mark the hard-coded peak rates.  The figure is always written to
    ``../plots/VecCopy.png``.

    Args:
        show: when true, also display the figure with ``plt.show()``.
    """
    gpu_sizes = [
        1000, 10000, 100000, 1000000, 2000000, 4000000, 6000000, 8000000,
        10000000, 20000000, 40000000, 60000000, 80000000, 100000000,
        1000000000
    ]
    cpu_sizes = [1000, 10000, 100000, 1000000, 10000000, 100000000,
                 1000000000]
    copy_scale = 2  # for VecCopy: one read plus one write per element

    cpu = []
    for size in cpu_sizes:
        elapsed = ut.get_time(
            "../data/vec-ops/vec_ops.n2_g0_c21_p7." + str(size) + ".654910",
            "VecCopy", 1)
        # The original set scale = 2 "for VecCopy" just before this loop but
        # never applied it, so the CPU curve was half the size of the GPU
        # curve's convention (and of the CPU VecCopy curves elsewhere in
        # this file).  NOTE(review): confirm against the published figure.
        cpu.append(copy_scale * ut.calc_rate(size, elapsed))

    gpu = []
    cputogpu = []
    for size in gpu_sizes:
        gpu_log = ("../data/figures-2-7-8-9/vec_ops.n1_g1_c2_a1." + str(size)
                   + ".668627")
        elapsed = ut.get_time(gpu_log, "VecCopy", 1)
        gpu.append(copy_scale * ut.calc_rate(size, elapsed))

        # copy to GPU (one memory access): keep the faster of the two runs
        nonpinned = ut.get_time(gpu_log, "VecCUDACopyTo", 1)
        pinned = ut.get_time(
            "../data/pinned/vec_ops.n1_g1_c42_a1." + str(size) + ".720947",
            "VecCUDACopyTo", 1)
        cputogpu.append(ut.calc_rate(size, min(pinned, nonpinned)))

    # calculate peak rates in 8 Mbytes/s
    cpu_rate = 135 * 1e9
    gpu_rate = 900 * 1e9
    cpu_to_gpu_rate = 50 * 1e9
    cpu_peak = (2 * cpu_rate) / (8 * 1e6)
    gpu_peak = gpu_rate / (8 * 1e6)
    cpu_to_gpu_peak = cpu_to_gpu_rate / (8 * 1e6)

    # plot
    num = 4
    cm = plt.get_cmap('inferno')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # set_color_cycle is deprecated since matplotlib 1.5 and absent from
    # current releases; set_prop_cycle is its replacement.
    ax.set_prop_cycle(color=[cm(float(i) / num) for i in range(num)])
    ax.plot(cpu_sizes, cpu, marker="o", markersize=4, markeredgewidth=2,
            label="7 CPU cores VecCopy")
    ax.plot(gpu_sizes, gpu, marker="o", markersize=4, markeredgewidth=2,
            label="1 GPU VecCopy")
    ax.plot(gpu_sizes, cputogpu, marker="o", markersize=4,
            markeredgewidth=2, label="1 GPU copy to GPU")

    # peak markers sit on the right edge of the plot, hence clip_on=False
    peak_tick = dict(color="black", linestyle="none", markersize=15,
                     markeredgewidth=2, marker="_", clip_on=False)
    plt.plot(2000000000, gpu_peak, **peak_tick)
    plt.plot(2000000000, cpu_peak, **peak_tick)
    plt.plot(2000000000, cpu_to_gpu_peak, **peak_tick)

    plt.text(1200000000, gpu_peak, "GPU copy peak",
             horizontalalignment='right', verticalalignment='center')
    plt.text(1200000000, cpu_peak, "CPU copy peak",
             horizontalalignment='right', verticalalignment='center')
    # this label is drawn 6000 below its marker
    plt.text(1600000000, cpu_to_gpu_peak - 6000, "CPU to GPU peak",
             horizontalalignment='right', verticalalignment='center')

    plt.xlim([500, 2000000000])
    plt.title("VecCopy performance", fontsize=12)
    plt.xlabel("Vector size", fontsize=12)
    plt.ylabel("8 Mbytes/second", fontsize=12)
    plt.legend(loc="upper left", fontsize=12, frameon=False)
    plt.tight_layout()
    plt.xscale('log')
    # show the y ticks with thousands separators
    ax.set_yticklabels(
        ['{:,}'.format(int(tick)) for tick in ax.get_yticks().tolist()])
    plt.savefig("../plots/VecCopy.png")
    if show:
        plt.show()