def config_hybrid_mem(options, system):
    """ Assign proper address ranges for DRAM and NVM controllers.
    Create memory controllers and add their shared bus to the system.

    The NVM occupies the bottom of the physical address space and the
    DRAM (sized from the page-translation-table options) occupies the
    top; both controllers hang off a shared THNVM crossbar.
    """
    system.thnvm_bus = VirtualXBar()

    # Interleave on 128-byte granularity, or cache-line granularity if
    # the line is larger; this locality-friendly default matches a wide
    # range of workloads.
    interleave = max(128, system.cache_line_size.value)
    total = Addr(options.mem_size)
    dram_bytes = pow(2, options.page_bits) * options.ptt_length

    def build_ctrl(mem_type, addr_range):
        # Construct one single-channel controller over addr_range,
        # honoring an explicit rank count for DRAM-derived controllers.
        cls = MemConfig.get(mem_type)
        ctrl = MemConfig.create_mem_ctrl(cls, addr_range, 0, 1, 0,
                                         interleave)
        if issubclass(cls, DRAMCtrl) and options.mem_ranks:
            ctrl.ranks_per_channel = options.mem_ranks
        return ctrl

    ctrls = []
    # NVM covers whatever the DRAM does not, starting at address 0.
    if dram_bytes < total.value:
        ctrls.append(build_ctrl(options.nvm_type,
                                AddrRange(0, total - dram_bytes - 1)))
    # DRAM covers the top of the address space, if any was requested.
    if dram_bytes > 0:
        ctrls.append(build_ctrl(options.dram_type,
                                AddrRange(total - dram_bytes, total - 1)))

    system.mem_ctrls = ctrls

    # Connect the controllers to the THNVM bus, and the bus to membus.
    for ctrl in system.mem_ctrls:
        ctrl.port = system.thnvm_bus.master
    system.thnvm_bus.slave = system.membus.master
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """Create one memory controller per (directory, memory range) pair.

    Controllers are attached to their directory either directly, via a
    per-directory IOXBar when there are multiple memory ranges, or via
    the system IO bus on ARM targets.  A separate controller is created
    for each address range because the abstract memory handles only one
    contiguous range at a time.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48

    block_offset_bits = int(math.log(options.cacheline_size, 2))
    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # Default: directory-select bits sit immediately above the block
        # offset, and the NUMA bit is the highest of those bits.
        numa_bit = block_offset_bits + int(math.log(options.num_dirs, 2)) - 1

    non_arm = buildEnv['TARGET_ISA'] != "arm"
    all_ctrls = []
    all_xbars = []

    for index, dir_cntrl in enumerate(dir_cntrls):
        dir_cntrl.directory.numa_high_bit = numa_bit

        xbar = None
        if non_arm:
            if len(system.mem_ranges) > 1:
                xbar = IOXBar()
                all_xbars.append(xbar)
                dir_cntrl.memory = xbar.slave
        else:
            # On ARM, route directory memory traffic through the IO bus.
            dir_cntrl.memory = system.iobus.slave

        for mem_range in system.mem_ranges:
            ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), mem_range, index,
                options.num_dirs, int(math.log(options.num_dirs, 2)),
                options.cacheline_size)
            all_ctrls.append(ctrl)

            if non_arm:
                if xbar is not None:
                    ctrl.port = xbar.master
                else:
                    ctrl.port = dir_cntrl.memory
            else:
                ctrl.port = system.iobus.master

    system.mem_ctrls = all_ctrls

    if non_arm and len(all_xbars) > 0:
        ruby.crossbars = all_xbars
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """Create one memory controller per (directory, memory range) pair.

    When the system has more than one memory range, each directory gets
    a private NoncoherentXBar fanning out to its controllers; otherwise
    the single controller connects straight to the directory.  A
    separate controller is created for each address range because the
    abstract memory handles only one contiguous range at a time.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48

    block_offset_bits = int(math.log(options.cacheline_size, 2))
    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # Default: directory-select bits sit immediately above the block
        # offset, and the NUMA bit is the highest of those bits.
        numa_bit = block_offset_bits + int(math.log(options.num_dirs, 2)) - 1

    all_ctrls = []
    all_xbars = []

    for index, dir_cntrl in enumerate(dir_cntrls):
        dir_cntrl.directory.numa_high_bit = numa_bit

        xbar = None
        if len(system.mem_ranges) > 1:
            xbar = NoncoherentXBar()
            all_xbars.append(xbar)
            dir_cntrl.memory = xbar.slave

        for mem_range in system.mem_ranges:
            ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), mem_range, index,
                options.num_dirs, int(math.log(options.num_dirs, 2)),
                options.cacheline_size)
            all_ctrls.append(ctrl)
            if xbar is not None:
                ctrl.port = xbar.master
            else:
                ctrl.port = dir_cntrl.memory

    system.mem_ctrls = all_ctrls

    if len(all_xbars) > 0:
        ruby.crossbars = all_xbars
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """Create Ruby memory controllers, optionally fronted by DRAM caches.

    When options.dramcache is set, one DRAMCache controller is created
    per directory controller and inserted between the directory and the
    backing memory controller; memory is then interleaved at the DRAM
    cache block size instead of the cache line size.  Only a single
    system memory range is supported in this configuration.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48
    block_size_bits = int(math.log(options.cacheline_size, 2))

    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # if the numa_bit is not specified, set the directory bits as the
        # lowest bits above the block offset bits, and the numa_bit as the
        # highest of those directory bits
        dir_bits = int(math.log(options.num_dirs, 2))
        numa_bit = block_size_bits + dir_bits - 1

    index = 0
    mem_ctrls = []
    crossbars = []

    # Sets bits to be used for interleaving.  Creates memory controllers
    # attached to a directory controller.  A separate controller is created
    # for each address range as the abstract memory can handle only one
    # contiguous address range as of now.
    for dir_cntrl in dir_cntrls:
        # Create 1 instance of DRAMCache per directory controller
        if options.dramcache:
            dramcache_ctrl = MemConfig.create_dramcache_ctrl(
                MemConfig.get_cache(options.dramcache_type),
                system.mem_ranges[0], index, options.num_dirs,
                options.dramcache_size, options.dramcache_assoc,
                options.dramcache_block_size, options.num_cpus,
                options.dramcache_timing)

            mem_ctrls.append(dramcache_ctrl)
            # Directory talks to the DRAM cache, not to memory directly.
            dir_cntrl.memory = dramcache_ctrl.port

        dir_cntrl.directory.numa_high_bit = numa_bit

        crossbar = None
        if len(system.mem_ranges) > 1:
            # we dont support this
            fatal("system mem_ranges greater than 1")
            # NOTE(review): the statements below are unreachable because
            # fatal() aborts the simulator; kept for reference only.
            crossbar = IOXBar()
            crossbars.append(crossbar)
            if options.dramcache:
                dramcache_ctrl.dramcache_masterport = crossbar.slave
            else:
                dir_cntrl.memory = crossbar.slave

        for r in system.mem_ranges:
            # if dramcache exists interleave at dramcache_block_size
            if options.dramcache:
                mem_ctrl = MemConfig.create_mem_ctrl(
                    MemConfig.get(options.mem_type), r, index,
                    options.num_dirs, int(math.log(options.num_dirs, 2)),
                    options.dramcache_block_size)
            else:
                mem_ctrl = MemConfig.create_mem_ctrl(
                    MemConfig.get(options.mem_type), r, index,
                    options.num_dirs, int(math.log(options.num_dirs, 2)),
                    options.cacheline_size)

            mem_ctrls.append(mem_ctrl)

            # Behind a crossbar when one exists; otherwise behind the
            # DRAM cache's master port, or directly on the directory.
            if crossbar != None:
                mem_ctrl.port = crossbar.master
            else:
                if options.dramcache:
                    mem_ctrl.port = dramcache_ctrl.dramcache_masterport
                else:
                    mem_ctrl.port = dir_cntrl.memory

        index += 1

    system.mem_ctrls = mem_ctrls

    if len(crossbars) > 0:
        ruby.crossbars = crossbars
def create_system(options, full_system, system, dma_devices, ruby_system):
    """Build the combined CPU+GPU Ruby system on the VI_hammer protocol.

    Wraps VI_hammer.create_system() and then adds: a CPU-side and a
    GPU-side copy-engine controller, per-shader-core GPU L1 caches,
    shared GPU L2 banks, and (when options.num_dev_dirs > 0) device-side
    directories with their own memory controllers.

    Returns (all_sequencers, dir_cntrls, complete_cluster).
    """
    if not buildEnv['GPGPU_SIM']:
        m5.util.panic(
            "This script requires GPGPU-Sim integration to be built.")

    # Run the protocol script to setup CPU cluster, directory and DMA
    (all_sequencers, dir_cntrls, dma_cntrls, cpu_cluster) = \
        VI_hammer.create_system(options, full_system, system, dma_devices,
                                ruby_system)

    # If we're going to split the directories/memory controllers
    if options.num_dev_dirs > 0:
        cpu_cntrl_count = len(cpu_cluster)
    else:
        cpu_cntrl_count = len(cpu_cluster) + len(dir_cntrls)

    #
    # Create controller for the copy engine to connect to in CPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size="4096B", assoc=2)

    cpu_ce_seq = RubySequencer(version=options.num_cpus + options.num_sc,
                               icache=cache, dcache=cache,
                               max_outstanding_requests=64,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    cpu_ce_cntrl = GPUCopyDMA_Controller(version=0,
                                         sequencer=cpu_ce_seq,
                                         number_of_TBEs=256,
                                         transitions_per_cycle=options.ports,
                                         ruby_system=ruby_system)

    # Wire the CPU copy engine's message buffers to the network.
    cpu_ce_cntrl.responseFromDir = MessageBuffer(ordered=True)
    cpu_ce_cntrl.responseFromDir.slave = ruby_system.network.master
    cpu_ce_cntrl.reqToDirectory = MessageBuffer(ordered=True)
    cpu_ce_cntrl.reqToDirectory.master = ruby_system.network.slave
    cpu_ce_cntrl.mandatoryQueue = MessageBuffer()

    ruby_system.ce_cntrl = cpu_ce_cntrl

    cpu_cntrl_count += 1

    #
    # Build GPU cluster
    #
    gpu_cluster = Cluster(intBW=32, extBW=32)
    gpu_cluster.disableConnectToParent()

    l2_bits = int(math.log(options.num_l2caches, 2))
    block_size_bits = int(math.log(options.cacheline_size, 2))
    # This represents the L1 to L2 interconnect latency
    # NOTE! This latency is in Ruby (cache) cycles, not SM cycles
    per_hop_interconnect_latency = 45  # ~15 GPU cycles
    num_dance_hall_hops = int(math.log(options.num_sc, 2))
    if num_dance_hall_hops == 0:
        num_dance_hall_hops = 1
    l1_to_l2_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops

    #
    # Caches for GPU cores
    #
    for i in xrange(options.num_sc):
        #
        # First create the Ruby objects associated with the GPU cores
        #
        cache = L1Cache(size=options.sc_l1_size,
                        assoc=options.sc_l1_assoc,
                        replacement_policy=LRUReplacementPolicy(),
                        start_index_bit=block_size_bits,
                        dataArrayBanks=4,
                        tagArrayBanks=4,
                        dataAccessLatency=4,
                        tagAccessLatency=4,
                        resourceStalls=False)

        l1_cntrl = GPUL1Cache_Controller(
            version=i, cache=cache,
            l2_select_num_bits=l2_bits,
            num_l2=options.num_l2caches,
            transitions_per_cycle=options.ports,
            issue_latency=l1_to_l2_noc_latency,
            number_of_TBEs=options.gpu_l1_buf_depth,
            ruby_system=ruby_system)

        gpu_seq = RubySequencer(
            version=options.num_cpus + i,
            icache=cache, dcache=cache,
            max_outstanding_requests=options.gpu_l1_buf_depth,
            ruby_system=ruby_system,
            deadlock_threshold=2000000,
            connect_to_io=False)

        l1_cntrl.sequencer = gpu_seq
        exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i)

        #
        # Add controllers and sequencers to the appropriate lists
        #
        all_sequencers.append(gpu_seq)
        gpu_cluster.add(l1_cntrl)

        # Connect the controller to the network
        l1_cntrl.requestFromL1Cache = MessageBuffer(ordered=True)
        l1_cntrl.requestFromL1Cache.master = ruby_system.network.slave
        l1_cntrl.responseToL1Cache = MessageBuffer(ordered=True)
        l1_cntrl.responseToL1Cache.slave = ruby_system.network.master
        l1_cntrl.mandatoryQueue = MessageBuffer()

    l2_index_start = block_size_bits + l2_bits
    # Use L2 cache and interconnect latencies to calculate protocol latencies
    # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles
    l2_cache_access_latency = 30  # ~10 GPU cycles
    l2_to_l1_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops
    l2_to_mem_noc_latency = 125  # ~40 GPU cycles

    l2_clusters = []
    for i in xrange(options.num_l2caches):
        #
        # First create the Ruby objects associated with this cpu
        #
        l2_cache = L2Cache(size=options.sc_l2_size,
                           assoc=options.sc_l2_assoc,
                           start_index_bit=l2_index_start,
                           replacement_policy=LRUReplacementPolicy(),
                           dataArrayBanks=4,
                           tagArrayBanks=4,
                           dataAccessLatency=4,
                           tagAccessLatency=4,
                           resourceStalls=options.gpu_l2_resource_stalls)

        l2_cntrl = GPUL2Cache_Controller(
            version=i, L2cache=l2_cache,
            transitions_per_cycle=options.ports,
            l2_response_latency=l2_cache_access_latency +
                                l2_to_l1_noc_latency,
            l2_request_latency=l2_to_mem_noc_latency,
            cache_response_latency=l2_cache_access_latency,
            ruby_system=ruby_system)

        exec("ruby_system.l2_cntrl%d = l2_cntrl" % i)
        # Each L2 bank lives in its own sub-cluster of the GPU cluster.
        l2_cluster = Cluster(intBW=32, extBW=32)
        l2_cluster.add(l2_cntrl)
        gpu_cluster.add(l2_cluster)
        l2_clusters.append(l2_cluster)

        # Connect the controller to the network
        l2_cntrl.responseToL1Cache = MessageBuffer(ordered=True)
        l2_cntrl.responseToL1Cache.master = ruby_system.network.slave
        l2_cntrl.requestFromCache = MessageBuffer()
        l2_cntrl.requestFromCache.master = ruby_system.network.slave
        l2_cntrl.responseFromCache = MessageBuffer()
        l2_cntrl.responseFromCache.master = ruby_system.network.slave
        l2_cntrl.unblockFromCache = MessageBuffer()
        l2_cntrl.unblockFromCache.master = ruby_system.network.slave

        l2_cntrl.requestFromL1Cache = MessageBuffer(ordered=True)
        l2_cntrl.requestFromL1Cache.slave = ruby_system.network.master
        l2_cntrl.forwardToCache = MessageBuffer()
        l2_cntrl.forwardToCache.slave = ruby_system.network.master
        l2_cntrl.responseToCache = MessageBuffer()
        l2_cntrl.responseToCache.slave = ruby_system.network.master
        l2_cntrl.triggerQueue = MessageBuffer()

    gpu_phys_mem_size = system.gpu.gpu_memory_range.size()

    # BUGFIX: initialize outside the branch below.  The cluster-assembly
    # loop near the end iterates dev_dir_cntrls unconditionally, which
    # raised a NameError whenever options.num_dev_dirs == 0.
    dev_dir_cntrls = []

    if options.num_dev_dirs > 0:
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not
                # overlap with the probe filter index
                assert (options.numa_high_bit - dir_bits > pf_bits)
            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrls)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory
            # controller
            #
            dir_version = i + num_cpu_dirs
            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size=pf_size, assoc=4,
                             start_index_bit=pf_start_bit)

            dev_dir_cntrl = Directory_Controller(
                version=dir_version,
                directory=RubyDirectoryMemory(
                    version=dir_version,
                    size=dir_size,
                    numa_high_bit=options.numa_high_bit,
                    device_directory=True),
                probeFilter=pf,
                probe_filter_enabled=options.pf_on,
                full_bit_dir_enabled=options.dir_on,
                transitions_per_cycle=options.ports,
                ruby_system=ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = MessageBuffer()
            dev_dir_cntrl.forwardFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = MessageBuffer()
            dev_dir_cntrl.responseFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaResponseFromDir.master = \
                ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = MessageBuffer()
            dev_dir_cntrl.unblockToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseToDir = MessageBuffer()
            dev_dir_cntrl.responseToDir.slave = ruby_system.network.master
            dev_dir_cntrl.requestToDir = MessageBuffer()
            dev_dir_cntrl.requestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaRequestToDir.slave = ruby_system.network.master

            dev_dir_cntrl.triggerQueue = MessageBuffer(ordered=True)
            dev_dir_cntrl.responseFromMemory = MessageBuffer()

            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                system.gpu.gpu_memory_range, i, options.num_dev_dirs,
                int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

        system.dev_mem_ctrls = dev_mem_ctrls
    else:
        # Since there are no device directories, use CPU directories
        # Fix up the memory sizes of the CPU directories
        num_dirs = len(dir_cntrls)
        add_gpu_mem = gpu_phys_mem_size / num_dirs
        for cntrl in dir_cntrls:
            new_size = cntrl.directory.size.value + add_gpu_mem
            cntrl.directory.size.value = new_size

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size="4096B", assoc=2)

    gpu_ce_seq = RubySequencer(
        version=options.num_cpus + options.num_sc + 1,
        icache=cache, dcache=cache,
        max_outstanding_requests=64,
        support_inst_reqs=False,
        ruby_system=ruby_system,
        connect_to_io=False)

    gpu_ce_cntrl = GPUCopyDMA_Controller(version=1,
                                         sequencer=gpu_ce_seq,
                                         number_of_TBEs=256,
                                         transitions_per_cycle=options.ports,
                                         ruby_system=ruby_system)

    ruby_system.dev_ce_cntrl = gpu_ce_cntrl

    all_sequencers.append(cpu_ce_seq)
    all_sequencers.append(gpu_ce_seq)

    # Wire the GPU copy engine's message buffers to the network.
    gpu_ce_cntrl.responseFromDir = MessageBuffer(ordered=True)
    gpu_ce_cntrl.responseFromDir.slave = ruby_system.network.master
    gpu_ce_cntrl.reqToDirectory = MessageBuffer(ordered=True)
    gpu_ce_cntrl.reqToDirectory.master = ruby_system.network.slave
    gpu_ce_cntrl.mandatoryQueue = MessageBuffer()

    # Assemble the full topology cluster from all pieces built above.
    complete_cluster = Cluster(intBW=32, extBW=32)
    complete_cluster.add(cpu_ce_cntrl)
    complete_cluster.add(gpu_ce_cntrl)
    complete_cluster.add(cpu_cluster)
    complete_cluster.add(gpu_cluster)

    for cntrl in dir_cntrls:
        complete_cluster.add(cntrl)
    for cntrl in dev_dir_cntrls:
        complete_cluster.add(cntrl)
    for cntrl in dma_cntrls:
        complete_cluster.add(cntrl)
    for cluster in l2_clusters:
        complete_cluster.add(cluster)

    return (all_sequencers, dir_cntrls, complete_cluster)
def create_system(options, full_system, system, dma_devices, ruby_system):
    """Build the combined CPU+GPU Ruby system on the VI_hammer protocol.

    Wraps VI_hammer.create_system() and then adds: a CPU-side and a
    GPU-side copy-engine controller, per-shader-core GPU L1 caches,
    shared GPU L2 banks (each with a region buffer), and, when
    options.num_dev_dirs > 0, device-side directories and their memory
    controllers.

    Returns (all_sequencers, dir_cntrls, complete_cluster).
    """
    if not buildEnv['GPGPU_SIM']:
        m5.util.panic(
            "This script requires GPGPU-Sim integration to be built.")

    # Run the protocol script to setup CPU cluster, directory and DMA
    (all_sequencers, dir_cntrls, dma_cntrls, cpu_cluster) = \
        VI_hammer.create_system(options, full_system, system, dma_devices,
                                ruby_system)

    # If we're going to split the directories/memory controllers
    if options.num_dev_dirs > 0:
        cpu_cntrl_count = len(cpu_cluster)
    else:
        cpu_cntrl_count = len(cpu_cluster) + len(dir_cntrls)

    #
    # Create controller for the copy engine to connect to in CPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size="4096B", assoc=2)

    cpu_ce_seq = RubySequencer(version=options.num_cpus + options.num_sc,
                               icache=cache, dcache=cache,
                               max_outstanding_requests=64,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    cpu_ce_cntrl = GPUCopyDMA_Controller(version=0,
                                         sequencer=cpu_ce_seq,
                                         number_of_TBEs=256,
                                         ruby_system=ruby_system)

    cpu_cntrl_count += 1

    # Wire the CPU copy engine directly to the network ports.
    cpu_ce_cntrl.responseFromDir = ruby_system.network.master
    cpu_ce_cntrl.reqToDirectory = ruby_system.network.slave

    #
    # Build GPU cluster
    #
    gpu_cluster = Cluster(intBW=32, extBW=32)
    gpu_cluster.disableConnectToParent()

    l2_bits = int(math.log(options.num_l2caches, 2))
    block_size_bits = int(math.log(options.cacheline_size, 2))
    # This represents the L1 to L2 interconnect latency
    # NOTE! This latency is in Ruby (cache) cycles, not SM cycles
    per_hop_interconnect_latency = 45  # ~15 GPU cycles
    num_dance_hall_hops = int(math.log(options.num_sc, 2))
    if num_dance_hall_hops == 0:
        num_dance_hall_hops = 1
    l1_to_l2_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops

    #
    # Caches for GPU cores
    #
    for i in xrange(options.num_sc):
        #
        # First create the Ruby objects associated with the GPU cores
        #
        cache = L1Cache(size=options.sc_l1_size,
                        assoc=options.sc_l1_assoc,
                        replacement_policy="LRU",
                        start_index_bit=block_size_bits,
                        dataArrayBanks=4,
                        tagArrayBanks=4,
                        dataAccessLatency=4,
                        tagAccessLatency=4,
                        resourceStalls=False)

        l1_cntrl = GPUL1Cache_Controller(
            version=i, cache=cache,
            l2_select_num_bits=l2_bits,
            num_l2=options.num_l2caches,
            issue_latency=l1_to_l2_noc_latency,
            number_of_TBEs=options.gpu_l1_buf_depth,
            ruby_system=ruby_system)

        gpu_seq = RubySequencer(
            version=options.num_cpus + i,
            icache=cache, dcache=cache,
            max_outstanding_requests=options.gpu_l1_buf_depth,
            ruby_system=ruby_system,
            deadlock_threshold=2000000,
            connect_to_io=False)

        l1_cntrl.sequencer = gpu_seq
        exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i)

        #
        # Add controllers and sequencers to the appropriate lists
        #
        all_sequencers.append(gpu_seq)
        gpu_cluster.add(l1_cntrl)

        # Connect the controller to the network
        l1_cntrl.requestFromL1Cache = ruby_system.network.slave
        l1_cntrl.responseToL1Cache = ruby_system.network.master

    l2_index_start = block_size_bits + l2_bits
    # Use L2 cache and interconnect latencies to calculate protocol latencies
    # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles
    l2_cache_access_latency = 30  # ~10 GPU cycles
    l2_to_l1_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops
    l2_to_mem_noc_latency = 125  # ~40 GPU cycles

    l2_clusters = []
    for i in xrange(options.num_l2caches):
        #
        # First create the Ruby objects associated with this cpu
        #
        l2_cache = L2Cache(size=options.sc_l2_size,
                           assoc=options.sc_l2_assoc,
                           start_index_bit=l2_index_start,
                           replacement_policy="LRU",
                           dataArrayBanks=4,
                           tagArrayBanks=4,
                           dataAccessLatency=4,
                           tagAccessLatency=4,
                           resourceStalls=options.gpu_l2_resource_stalls)

        # BUGFIX: the associativity was written "2^16", which in Python
        # is bitwise XOR (== 18), not exponentiation.  2**16 is the
        # intended (effectively fully associative) value.
        region_buffer = regionBuffer_Obj(
            size="8MB", assoc=2**16,
            start_index_bit=l2_index_start,
            replacement_policy="LRU",
            dataArrayBanks=4,
            tagArrayBanks=4,
            dataAccessLatency=4,
            tagAccessLatency=4,
            resourceStalls=options.gpu_l2_resource_stalls,
            regionSize=options.region_size)

        l2_cntrl = GPUL2Cache_Controller(
            version=i, L2cache=l2_cache, regionBuffer=region_buffer,
            l2_response_latency=l2_cache_access_latency +
                                l2_to_l1_noc_latency,
            l2_request_latency=l2_to_mem_noc_latency,
            cache_response_latency=l2_cache_access_latency,
            ruby_system=ruby_system)

        exec("ruby_system.l2_cntrl%d = l2_cntrl" % i)
        # Each L2 bank lives in its own sub-cluster of the GPU cluster.
        l2_cluster = Cluster(intBW=32, extBW=32)
        l2_cluster.add(l2_cntrl)
        gpu_cluster.add(l2_cluster)
        l2_clusters.append(l2_cluster)

        # Connect the controller to the network
        l2_cntrl.responseToL1Cache = ruby_system.network.slave
        l2_cntrl.requestFromCache = ruby_system.network.slave
        l2_cntrl.responseFromCache = ruby_system.network.slave
        l2_cntrl.unblockFromCache = ruby_system.network.slave

        l2_cntrl.requestFromL1Cache = ruby_system.network.master
        l2_cntrl.forwardToCache = ruby_system.network.master
        l2_cntrl.responseToCache = ruby_system.network.master

    gpu_phys_mem_size = system.gpu.gpu_memory_range.size()

    # BUGFIX: initialize outside the branch below.  The cluster-assembly
    # loop near the end iterates dev_dir_cntrls unconditionally, which
    # raised a NameError whenever options.num_dev_dirs == 0.
    dev_dir_cntrls = []

    if options.num_dev_dirs > 0:
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not
                # overlap with the probe filter index
                assert(options.numa_high_bit - dir_bits > pf_bits)
            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrls)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory
            # controller
            #
            dir_version = i + num_cpu_dirs
            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size=pf_size, assoc=4,
                             start_index_bit=pf_start_bit)

            dev_dir_cntrl = Directory_Controller(
                version=dir_version,
                directory=RubyDirectoryMemory(
                    version=dir_version,
                    size=dir_size,
                    numa_high_bit=options.numa_high_bit,
                    device_directory=True),
                probeFilter=pf,
                probe_filter_enabled=options.pf_on,
                full_bit_dir_enabled=options.dir_on,
                ruby_system=ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = ruby_system.network.master
            dev_dir_cntrl.responseToDir = ruby_system.network.master
            dev_dir_cntrl.requestToDir = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = ruby_system.network.master

            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                system.gpu.gpu_memory_range, i, options.num_dev_dirs,
                int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

        system.dev_mem_ctrls = dev_mem_ctrls
    else:
        # Since there are no device directories, use CPU directories
        # Fix up the memory sizes of the CPU directories
        num_dirs = len(dir_cntrls)
        add_gpu_mem = gpu_phys_mem_size / num_dirs
        for cntrl in dir_cntrls:
            new_size = cntrl.directory.size.value + add_gpu_mem
            cntrl.directory.size.value = new_size

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size="4096B", assoc=2)

    gpu_ce_seq = RubySequencer(
        version=options.num_cpus + options.num_sc + 1,
        icache=cache, dcache=cache,
        max_outstanding_requests=64,
        support_inst_reqs=False,
        ruby_system=ruby_system,
        connect_to_io=False)

    gpu_ce_cntrl = GPUCopyDMA_Controller(version=1,
                                         sequencer=gpu_ce_seq,
                                         number_of_TBEs=256,
                                         ruby_system=ruby_system)

    ruby_system.l1_cntrl_ce = gpu_ce_cntrl

    all_sequencers.append(cpu_ce_seq)
    all_sequencers.append(gpu_ce_seq)

    # Wire the GPU copy engine directly to the network ports.
    gpu_ce_cntrl.responseFromDir = ruby_system.network.master
    gpu_ce_cntrl.reqToDirectory = ruby_system.network.slave

    # Assemble the full topology cluster from all pieces built above.
    complete_cluster = Cluster(intBW=32, extBW=32)
    complete_cluster.add(cpu_ce_cntrl)
    complete_cluster.add(gpu_ce_cntrl)
    complete_cluster.add(cpu_cluster)
    complete_cluster.add(gpu_cluster)

    for cntrl in dir_cntrls:
        complete_cluster.add(cntrl)
    for cntrl in dev_dir_cntrls:
        complete_cluster.add(cntrl)
    for cntrl in dma_cntrls:
        complete_cluster.add(cntrl)
    for cluster in l2_clusters:
        complete_cluster.add(cluster)

    return (all_sequencers, dir_cntrls, complete_cluster)
def create_system(options, full_system, system, dma_ports, ruby_system):
    """Build a fused CPU+GPU Ruby system on top of an existing protocol.

    Rewrites the configured 'split' protocol name to its 'fusion'
    variant, dynamically imports that protocol module and delegates the
    base system construction to it, then optionally adds device-side
    directories (with probe filters and memory controllers) and a
    copy-engine L1 controller for the GPU cluster.

    Returns (cpu_sequencers, dir_cntrl_nodes, topology).
    """
    if not buildEnv['GPGPU_SIM']:
        m5.util.panic("This script requires GPGPU-Sim integration to be built.")

    # Ruby backing store is required so the fused GPU can access memory
    # functionally.
    options.access_backing_store = True

    # Run the original protocol script
    buildEnv['PROTOCOL'] = buildEnv['PROTOCOL'].replace('split', 'fusion')
    protocol = buildEnv['PROTOCOL']
    # Python 2 statement-form exec/eval: import and invoke the protocol
    # module named at build time.
    exec "import %s" % protocol
    try:
        (cpu_sequencers, dir_cntrl_nodes, topology) = \
            eval("%s.create_system(options, full_system, system, dma_ports, ruby_system)" % protocol)
    except:
        # Report which protocol failed, then re-raise the original error.
        print "Error: could not create system for ruby protocol inside fusion system %s" % protocol
        raise

    # Faking things to build the rest of the system
    print "Warning!"
    print "Warning: Faking split MOESI_hammer protocol; collecting checkpoints?"
    print "Warning!"

    if options.num_dev_dirs > 0:
        block_size_bits = int(math.log(options.cacheline_size, 2))
        gpu_phys_mem_size = system.gpu.gpu_memory_range.size()
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not
                # overlap with the probe filter index
                assert(options.numa_high_bit - dir_bits > pf_bits)
            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_dir_cntrls = []
        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrl_nodes)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory
            # controller
            #
            dir_version = i + num_cpu_dirs
            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size = pf_size, assoc = 4,
                             start_index_bit = pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                     version = dir_version,
                                     size = dir_size,
                                     numa_high_bit = \
                                       options.numa_high_bit,
                                     device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = ruby_system.network.master
            dev_dir_cntrl.responseToDir = ruby_system.network.master
            dev_dir_cntrl.requestToDir = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = ruby_system.network.master

            # One memory controller per device directory, covering the
            # GPU's physical memory range.
            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                system.gpu.gpu_memory_range, i,
                options.num_dev_dirs,
                int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

            topology.addController(dev_dir_cntrl)

        system.dev_mem_ctrls = dev_mem_ctrls

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    block_size_bits = int(math.log(options.cacheline_size, 2))
    l1i_cache = L1Cache(size = "2kB", assoc = 2)
    l1d_cache = L1Cache(size = "2kB", assoc = 2)
    l2_cache = L2Cache(size = "2kB", assoc = 2,
                       start_index_bit = block_size_bits)

    l1_cntrl = L1Cache_Controller(version = options.num_cpus + options.num_sc,
                                  L1Icache = l1i_cache,
                                  L1Dcache = l1d_cache,
                                  L2cache = l2_cache,
                                  no_mig_atomic = not \
                                      options.allow_atomic_migration,
                                  send_evictions = (
                                      options.cpu_type == "detailed"),
                                  ruby_system = ruby_system)

    gpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc,
                               icache = l1i_cache,
                               dcache = l1d_cache,
                               max_outstanding_requests = 64,
                               ruby_system = ruby_system,
                               connect_to_io = False)

    l1_cntrl.sequencer = gpu_ce_seq

    ruby_system.l1_cntrl_gpuce = l1_cntrl

    cpu_sequencers.append(gpu_ce_seq)
    topology.addController(l1_cntrl)

    # Connect the L1 controller and the network
    # Connect the buffers from the controller to network
    l1_cntrl.requestFromCache = ruby_system.network.slave
    l1_cntrl.responseFromCache = ruby_system.network.slave
    l1_cntrl.unblockFromCache = ruby_system.network.slave

    # Connect the buffers from the network to the controller
    l1_cntrl.forwardToCache = ruby_system.network.master
    l1_cntrl.responseToCache = ruby_system.network.master

    return (cpu_sequencers, dir_cntrl_nodes, topology)
def create_system(options, full_system, system, dma_ports, ruby_system):
    """Build the fused CPU+GPU Ruby memory system.

    Rewrites the build-time PROTOCOL name from 'split' to 'fusion',
    delegates CPU-side construction to that protocol's own
    create_system(), then augments the result with:
      * one Directory_Controller + memory controller per GPU device
        directory (options.num_dev_dirs), wired to the Ruby network, and
      * an extra L1Cache_Controller/RubySequencer pair for the GPU copy
        engine (its caches are placeholders and unused).

    Returns the (cpu_sequencers, dir_cntrl_nodes, topology) tuple that
    the top-level Ruby configuration expects.

    NOTE(review): Python 2 code (print statements, exec, xrange).
    """
    # GPU simulation is impossible without the GPGPU-Sim integration
    # compiled into this gem5 build, so fail fast.
    if not buildEnv['GPGPU_SIM']:
        m5.util.panic("This script requires GPGPU-Sim integration to be built.")

    # Ruby must mirror functional data into the backing store for the
    # GPU path; force it on regardless of the command line.
    options.access_backing_store = True

    # Run the original protocol script
    # (e.g. 'MOESI_hammer_split' becomes 'MOESI_hammer_fusion'; the
    # renamed module is imported and its create_system() invoked).
    buildEnv['PROTOCOL'] = buildEnv['PROTOCOL'].replace('split', 'fusion')
    protocol = buildEnv['PROTOCOL']
    exec "import %s" % protocol
    try:
        (cpu_sequencers, dir_cntrl_nodes, topology) = \
             eval("%s.create_system(options, full_system, system, dma_ports, ruby_system)" % protocol)
    except:
        # Report which protocol failed, then re-raise the original error
        # unchanged so the real traceback is preserved.
        print "Error: could not create system for ruby protocol inside fusion system %s" % protocol
        raise

    # Faking things to build the rest of the system
    print "Warning!"
    print "Warning: Faking split MOESI_hammer protocol; collecting checkpoints?"
    print "Warning!"

    # --- GPU device directories (only if any were requested) ---
    if options.num_dev_dirs > 0:
        block_size_bits = int(math.log(options.cacheline_size, 2))
        # Split the GPU's physical memory range evenly across the
        # device directories; each directory fronts one module.
        gpu_phys_mem_size = system.gpu.gpu_memory_range.size()
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not overlap
                # with the probe filter index
                assert (options.numa_high_bit - dir_bits > pf_bits)

            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            # No explicit NUMA bit: place the probe filter index above the
            # directory-select bits (when there is more than one directory).
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_dir_cntrls = []
        dev_mem_ctrls = []
        # Device directory versions continue after the CPU directories so
        # every Directory_Controller has a unique version number.
        num_cpu_dirs = len(dir_cntrl_nodes)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory controller
            #
            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size=pf_size, assoc=4,
                             start_index_bit=pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                            version = dir_version,
                                            size = dir_size,
                                            numa_high_bit = \
                                              options.numa_high_bit,
                                            device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 transitions_per_cycle = options.ports,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            # Attach the controller to the ruby_system SimObject tree under
            # a unique child name (exec needed for the dynamic attribute).
            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            # (controller -> network buffers first, then network -> controller).
            dev_dir_cntrl.forwardFromDir = MessageBuffer()
            dev_dir_cntrl.forwardFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = MessageBuffer()
            dev_dir_cntrl.responseFromDir.master = ruby_system.network.slave
            # DMA traffic uses ordered buffers.
            dev_dir_cntrl.dmaResponseFromDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaResponseFromDir.master = ruby_system.network.slave

            # Internal trigger queue; not connected to the network.
            dev_dir_cntrl.triggerQueue = MessageBuffer(ordered=True)

            dev_dir_cntrl.unblockToDir = MessageBuffer()
            dev_dir_cntrl.unblockToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseToDir = MessageBuffer()
            dev_dir_cntrl.responseToDir.slave = ruby_system.network.master
            dev_dir_cntrl.requestToDir = MessageBuffer()
            dev_dir_cntrl.requestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaRequestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseFromMemory = MessageBuffer()

            # One memory controller per device directory, interleaved over
            # the GPU memory range by the directory-select bits.
            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), system.gpu.gpu_memory_range,
                i, options.num_dev_dirs, int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

            topology.addController(dev_dir_cntrl)

        system.dev_mem_ctrls = dev_mem_ctrls

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    block_size_bits = int(math.log(options.cacheline_size, 2))
    # Minimal 2kB placeholder caches: the copy engine does not cache data,
    # but the controller's parameters require cache objects.
    l1i_cache = L1Cache(size="2kB", assoc=2)
    l1d_cache = L1Cache(size="2kB", assoc=2)
    l2_cache = L2Cache(size="2kB", assoc=2, start_index_bit=block_size_bits)

    # Version is placed after all CPU and shader-core sequencers so it is
    # unique across the system.
    l1_cntrl = L1Cache_Controller(version = options.num_cpus + options.num_sc,
                                  L1Icache = l1i_cache,
                                  L1Dcache = l1d_cache,
                                  L2cache = l2_cache,
                                  no_mig_atomic = not \
                                    options.allow_atomic_migration,
                                  send_evictions = False,
                                  transitions_per_cycle = options.ports,
                                  ruby_system = ruby_system)

    gpu_ce_seq = RubySequencer(version=options.num_cpus + options.num_sc,
                               icache=l1i_cache,
                               dcache=l1d_cache,
                               max_outstanding_requests=64,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    l1_cntrl.sequencer = gpu_ce_seq
    ruby_system.dev_ce_cntrl = l1_cntrl

    # The copy-engine sequencer joins the list returned to the caller so
    # the top-level config hooks it up like any other CPU sequencer.
    cpu_sequencers.append(gpu_ce_seq)
    topology.addController(l1_cntrl)

    # Connect the L1 controller and the network
    # Connect the buffers from the controller to network
    l1_cntrl.requestFromCache = MessageBuffer()
    l1_cntrl.requestFromCache.master = ruby_system.network.slave
    l1_cntrl.responseFromCache = MessageBuffer()
    l1_cntrl.responseFromCache.master = \
        ruby_system.network.slave
    l1_cntrl.unblockFromCache = MessageBuffer()
    l1_cntrl.unblockFromCache.master = ruby_system.network.slave

    # Controller-internal queues; not network-connected.
    l1_cntrl.triggerQueue = MessageBuffer()

    # Connect the buffers from the network to the controller
    l1_cntrl.mandatoryQueue = MessageBuffer()
    l1_cntrl.forwardToCache = MessageBuffer()
    l1_cntrl.forwardToCache.slave = ruby_system.network.master
    l1_cntrl.responseToCache = MessageBuffer()
    l1_cntrl.responseToCache.slave = ruby_system.network.master

    return (cpu_sequencers, dir_cntrl_nodes, topology)