def config_hybrid_mem(options, system):
    """Assign proper address ranges for DRAM and NVM controllers.

    Splits the physical address space [0, options.mem_size) into a low
    NVM region and a high DRAM region whose size is derived from the
    page-translation-table options (2**page_bits * ptt_length), creates
    one controller per non-empty region, and attaches both controllers
    to a shared THNVM crossbar hanging off the system memory bus.
    """
    system.thnvm_bus = VirtualXBar()
    mem_ctrls = []

    # The default behaviour is to interleave memory channels on 128
    # byte granularity, or cache line granularity if larger than 128
    # byte. This value is based on the locality seen across a large
    # range of workloads.
    intlv_size = max(128, system.cache_line_size.value)

    total_size = Addr(options.mem_size)
    # DRAM capacity equals what the page translation table can cover.
    dram_size = pow(2, options.page_bits) * options.ptt_length

    if dram_size < total_size.value:
        # NVM backs the bottom of the address space.
        nvm_cls = MemConfig.get(options.nvm_type)
        nvm_range = AddrRange(0, total_size - dram_size - 1)
        nvm_ctrl = MemConfig.create_mem_ctrl(nvm_cls, nvm_range, 0, 1, 0,
                                             intlv_size)
        # Set the number of ranks based on the command-line
        # options if it was explicitly set
        if issubclass(nvm_cls, DRAMCtrl) and options.mem_ranks:
            nvm_ctrl.ranks_per_channel = options.mem_ranks
        mem_ctrls.append(nvm_ctrl)

    if dram_size > 0:
        # DRAM occupies the top of the address space.
        dram_cls = MemConfig.get(options.dram_type)
        dram_range = AddrRange(total_size - dram_size, total_size - 1)
        dram_ctrl = MemConfig.create_mem_ctrl(dram_cls, dram_range, 0, 1, 0,
                                              intlv_size)
        # Set the number of ranks based on the command-line
        # options if it was explicitly set
        if issubclass(dram_cls, DRAMCtrl) and options.mem_ranks:
            dram_ctrl.ranks_per_channel = options.mem_ranks
        mem_ctrls.append(dram_ctrl)

    system.mem_ctrls = mem_ctrls

    # Connect the controllers to the THNVM bus. Iterate the controller
    # list directly instead of the old C-style xrange(len(...)) loop.
    for ctrl in mem_ctrls:
        ctrl.port = system.thnvm_bus.master
    system.thnvm_bus.slave = system.membus.master
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """Create one memory controller per (directory, address range) pair.

    Each controller is wired either to its directory's memory port, to a
    per-directory IOXBar (when several address ranges exist), or — on
    ARM — to the system iobus.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48

    block_size_bits = int(math.log(options.cacheline_size, 2))
    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # With no explicit numa bit, the directory-select bits sit just
        # above the block offset, and the numa bit is the highest of them.
        dir_bits = int(math.log(options.num_dirs, 2))
        numa_bit = block_size_bits + dir_bits - 1

    is_arm = buildEnv['TARGET_ISA'] == "arm"
    ctrl_index = 0
    controllers = []
    xbars = []

    # A separate controller is created for each address range, since the
    # abstract memory can only cover one contiguous range at a time.
    for dir_cntrl in dir_cntrls:
        dir_cntrl.directory.numa_high_bit = numa_bit
        xbar = None
        if not is_arm:
            if len(system.mem_ranges) > 1:
                xbar = IOXBar()
                xbars.append(xbar)
                dir_cntrl.memory = xbar.slave
        else:
            # ARM: route directory memory traffic through the iobus.
            dir_cntrl.memory = system.iobus.slave
        for mem_range in system.mem_ranges:
            ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), mem_range, ctrl_index,
                options.num_dirs, int(math.log(options.num_dirs, 2)),
                options.cacheline_size)
            controllers.append(ctrl)
            if is_arm:
                ctrl.port = system.iobus.master
            elif xbar is not None:
                ctrl.port = xbar.master
            else:
                ctrl.port = dir_cntrl.memory
            ctrl_index += 1

    system.mem_ctrls = controllers
    if not is_arm and len(xbars) > 0:
        ruby.crossbars = xbars
def jiwon_config_pim( test_sys, options ):
    """Build the PIM (processing-in-memory) subsystem.

    For each of options.num_pim_sys PIM stacks this:
      * creates a vault memory controller covering one DRAM channel,
      * instantiates a PIM core system,
      * sets up the p2s/s2p bridges between the PIM-local bus and host,
      * registers the vault's virtual->physical remapping in the PIM TLBs,
      * optionally interposes a CommMonitor, and wires the PIM either to
        the SMC crossbar or (MOVE_PIM_TO_HOST) to the host membus.
    """
    (TestCPUClass, test_mem_mode, FutureClass) = Simulation.setCPUClass(options)

    # Create PIM vault ctrls. Each vault covers a single DRAM channel at
    # its own base address, so every controller is built as channel 0 of a
    # one-channel configuration (i=0, nbr_mem_ctrls=1, no interleaving).
    cls = MemConfig.get(options.pim_mem_type)
    pim_vault_ctrls = []
    for i in xrange(options.num_pim_sys):
        vault_range = AddrRange(PIM_VAULT_BASE_ADDR + i*DRAM_CHANNEL_SIZE_INT,
                                size=DRAM_CHANNEL_SIZE_MB)
        pim_vault = ethz_create_mem_ctrl(
            cls, vault_range, i=0, nbr_mem_ctrls=1, intlv_bits=0,
            cache_line_size=test_sys.cache_line_size.value)
        pim_vault_ctrls.append(pim_vault)
    test_sys.pim_vault_ctrls = pim_vault_ctrls

    # Create PIM cores.
    test_sys.pim_sys = [build_pim_system(options, test_mem_mode, TestCPUClass,
                                         pim_id=i)
                        for i in xrange(options.num_pim_sys)]

    for i in xrange(options.num_pim_sys):
        pim = test_sys.pim_sys[i]

        # Physical window through which the host reaches this PIM device.
        pim_device_range = AddrRange(PIM_ADDRESS_BASE + i*PIM_ADDRESS_SIZE_INT,
                                     size=PIM_ADDRESS_SIZE)
        # Physical address range of this PIM's vault.
        pim_vault_phys_addr_base = PIM_VAULT_BASE_ADDR + i*DRAM_CHANNEL_SIZE_INT
        pim_vault_range = AddrRange(pim_vault_phys_addr_base,
                                    size=DRAM_CHANNEL_SIZE_MB)

        # PIM-to-system (p2s) and system-to-PIM (s2p) bridges.
        pim.p2s = Bridge(ranges=pim_vault_range, delay='0.01ns',
                         req_size=32, resp_size=32)
        pim.s2p = Bridge(ranges=pim_device_range, delay='0.01ns',
                         req_size=32, resp_size=32)

        # Add the vault's virtual->physical mapping to all three PIM TLBs
        # (was six copy-pasted append calls).
        # JIWON: TODO-- need to change this to something more reasonable....
        pim_vault_virt_addr_base = 0xC0000000
        for tlb in (pim.itlb, pim.dtlb, pim.stlb):
            tlb.original_ranges.append(
                AddrRange(pim_vault_virt_addr_base, size=DRAM_CHANNEL_SIZE_MB))
            tlb.remapped_ranges.append(
                AddrRange(pim_vault_phys_addr_base, size=DRAM_CHANNEL_SIZE_MB))

        if ( GEM5_ENABLE_COMM_MONITORS == "TRUE" ):
            # Interpose a monitor between the PIM bus and the p2s bridge.
            pim.Smon = CommMonitor()
            if ( SMON_DUMP_ADDRESS == "TRUE" ):
                pim.Smon.dump_addresses = True
                pim.Smon.dump_file = "m5out/smon_addr_dump.txt"
            pim.pimbus.master = pim.Smon.slave
            pim.Smon.master = pim.p2s.slave
        else:
            pim.pimbus.master = pim.p2s.slave
        pim.s2p.master = pim.pimbus.slave

        if ( MOVE_PIM_TO_HOST == "FALSE" ):
            # connect PIM core to system
            test_sys.smcxbar.master = pim.s2p.slave
            # connect PIM vault to PIM core
            test_sys.pim_vault_ctrls[i].port = pim.p2s.master
        else:
            pim.p2s.master = test_sys.membus.slave
            test_sys.membus.master = pim.s2p.slave
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """Instantiate the Ruby-side memory controllers.

    Builds one controller per (directory, memory range) pair. When the
    system has several memory ranges, each directory gets a private
    NoncoherentXBar fanning out to its controllers; otherwise each
    controller plugs straight into its directory's memory port.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48

    block_size_bits = int(math.log(options.cacheline_size, 2))
    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # Directory-select bits sit immediately above the block offset;
        # the numa bit is the top one of those.
        dir_bits = int(math.log(options.num_dirs, 2))
        numa_bit = block_size_bits + dir_bits - 1

    idx = 0
    controllers = []
    xbars = []
    multi_range = len(system.mem_ranges) > 1

    # One controller per contiguous range: the abstract memory model can
    # only back a single contiguous address range at a time.
    for dir_cntrl in dir_cntrls:
        dir_cntrl.directory.numa_high_bit = numa_bit
        xbar = None
        if multi_range:
            xbar = NoncoherentXBar()
            xbars.append(xbar)
            dir_cntrl.memory = xbar.slave
        for mem_range in system.mem_ranges:
            ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), mem_range, idx,
                options.num_dirs, int(math.log(options.num_dirs, 2)),
                options.cacheline_size)
            controllers.append(ctrl)
            ctrl.port = xbar.master if xbar is not None else dir_cntrl.memory
            idx += 1

    system.mem_ctrls = controllers
    if xbars:
        ruby.crossbars = xbars
def ethz_config_mem(options, system):
    """ Create the memory controllers based on the options and attach
    them.

    If requested, we make a multi-channel configuration of the selected
    memory controller class by creating multiple instances of the
    specific class. The individual controllers have their parameters set
    such that the address range is interleaved between them.
    """
    nbr_mem_ctrls = options.mem_channels
    import math
    from m5.util import fatal
    intlv_bits = int(math.log(nbr_mem_ctrls, 2))
    if 2**intlv_bits != nbr_mem_ctrls:
        fatal("Number of memory channels must be a power of 2")
    cls = MemConfig.get(options.mem_type)
    mem_ctrls = []

    # For every range (most systems will only have one), create an
    # array of controllers and set their parameters to match their
    # address mapping in the case of a DRAM
    for r in system.mem_ranges:
        for i in xrange(nbr_mem_ctrls):
            mem_ctrls.append(
                ethz_create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits,
                                     system.cache_line_size.value))

    system.mem_ctrls = mem_ctrls

    # Connect the controllers to the smcxbar (modified by Erfan).
    # Iterate the controller list directly rather than via
    # xrange(len(...)) indexing.
    for ctrl in mem_ctrls:
        ctrl.port = system.smcxbar.master

    ethz_print_val("system.cache_line_size.value (bytes)",
                   system.cache_line_size.value)
    if (options.mem_type != "ethz_ModelsimIF"):
        ethz_print_val("number of vaults", nbr_mem_ctrls)
def setMemClass(options):
    """Look up and return the controller class named by options.mem_type."""
    mem_cls = MemConfig.get(options.mem_type)
    return mem_cls
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    # Create the Ruby-side memory controllers, optionally fronted by a
    # DRAM cache per directory controller, and wire their ports.
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48

    block_size_bits = int(math.log(options.cacheline_size, 2))

    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # if the numa_bit is not specified, set the directory bits as the
        # lowest bits above the block offset bits, and the numa_bit as the
        # highest of those directory bits
        dir_bits = int(math.log(options.num_dirs, 2))
        numa_bit = block_size_bits + dir_bits - 1

    index = 0
    mem_ctrls = []
    crossbars = []

    # Sets bits to be used for interleaving. Creates memory controllers
    # attached to a directory controller. A separate controller is created
    # for each address range as the abstract memory can handle only one
    # contiguous address range as of now.
    for dir_cntrl in dir_cntrls:
        # Create 1 instance of DRAMCache per directory controller
        if options.dramcache:
            dramcache_ctrl = MemConfig.create_dramcache_ctrl(
                MemConfig.get_cache(options.dramcache_type),
                system.mem_ranges[0], index, options.num_dirs,
                options.dramcache_size, options.dramcache_assoc,
                options.dramcache_block_size, options.num_cpus,
                options.dramcache_timing)

            mem_ctrls.append(dramcache_ctrl)
            # The directory talks to the DRAM cache, which in turn fronts
            # the real memory controller(s) created below.
            dir_cntrl.memory = dramcache_ctrl.port

        dir_cntrl.directory.numa_high_bit = numa_bit

        crossbar = None
        if len(system.mem_ranges) > 1:
            # we dont support this
            # NOTE(review): fatal() aborts the simulator here, so the
            # crossbar wiring below is dead code kept from the base script.
            fatal("system mem_ranges greater than 1")
            crossbar = IOXBar()
            crossbars.append(crossbar)
            if options.dramcache:
                dramcache_ctrl.dramcache_masterport = crossbar.slave
            else:
                dir_cntrl.memory = crossbar.slave

        for r in system.mem_ranges:
            # if dramcache exists interleave at dramcache_block_size
            if options.dramcache:
                mem_ctrl = MemConfig.create_mem_ctrl(
                    MemConfig.get(options.mem_type), r, index,
                    options.num_dirs, int(math.log(options.num_dirs, 2)),
                    options.dramcache_block_size)
            else:
                mem_ctrl = MemConfig.create_mem_ctrl(
                    MemConfig.get(options.mem_type), r, index,
                    options.num_dirs, int(math.log(options.num_dirs, 2)),
                    options.cacheline_size)

            mem_ctrls.append(mem_ctrl)

            if crossbar != None:
                mem_ctrl.port = crossbar.master
            else:
                if options.dramcache:
                    # Memory sits behind the DRAM cache's master port.
                    mem_ctrl.port = dramcache_ctrl.dramcache_masterport
                else:
                    mem_ctrl.port = dir_cntrl.memory

            index += 1

    system.mem_ctrls = mem_ctrls

    if len(crossbars) > 0:
        ruby.crossbars = crossbars
def create_system(options, full_system, system, dma_devices, ruby_system):
    """Build the combined CPU+GPU Ruby system on top of VI_hammer.

    Creates copy-engine controllers for CPU and GPU, the GPU L1/L2 cache
    cluster, optional device (GPU-side) directories with their memory
    controllers, and assembles everything into one cluster hierarchy.

    Returns (all_sequencers, dir_cntrls, complete_cluster).
    """
    if not buildEnv['GPGPU_SIM']:
        m5.util.panic(
            "This script requires GPGPU-Sim integration to be built.")

    # Run the protocol script to setup CPU cluster, directory and DMA
    (all_sequencers, dir_cntrls, dma_cntrls, cpu_cluster) = \
        VI_hammer.create_system(options, full_system, system, dma_devices,
                                ruby_system)

    # If we're going to split the directories/memory controllers
    if options.num_dev_dirs > 0:
        cpu_cntrl_count = len(cpu_cluster)
    else:
        cpu_cntrl_count = len(cpu_cluster) + len(dir_cntrls)

    #
    # Create controller for the copy engine to connect to in CPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size="4096B", assoc=2)

    cpu_ce_seq = RubySequencer(version=options.num_cpus + options.num_sc,
                               icache=cache, dcache=cache,
                               max_outstanding_requests=64,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    cpu_ce_cntrl = GPUCopyDMA_Controller(version=0, sequencer=cpu_ce_seq,
                                         number_of_TBEs=256,
                                         transitions_per_cycle=options.ports,
                                         ruby_system=ruby_system)

    # Wire the copy engine controller into the network.
    cpu_ce_cntrl.responseFromDir = MessageBuffer(ordered=True)
    cpu_ce_cntrl.responseFromDir.slave = ruby_system.network.master
    cpu_ce_cntrl.reqToDirectory = MessageBuffer(ordered=True)
    cpu_ce_cntrl.reqToDirectory.master = ruby_system.network.slave
    cpu_ce_cntrl.mandatoryQueue = MessageBuffer()

    ruby_system.ce_cntrl = cpu_ce_cntrl

    cpu_cntrl_count += 1

    #
    # Build GPU cluster
    #
    gpu_cluster = Cluster(intBW=32, extBW=32)
    gpu_cluster.disableConnectToParent()

    l2_bits = int(math.log(options.num_l2caches, 2))
    block_size_bits = int(math.log(options.cacheline_size, 2))

    # This represents the L1 to L2 interconnect latency
    # NOTE! This latency is in Ruby (cache) cycles, not SM cycles
    per_hop_interconnect_latency = 45  # ~15 GPU cycles
    num_dance_hall_hops = int(math.log(options.num_sc, 2))
    if num_dance_hall_hops == 0:
        num_dance_hall_hops = 1
    l1_to_l2_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops

    #
    # Caches for GPU cores
    #
    for i in xrange(options.num_sc):
        #
        # First create the Ruby objects associated with the GPU cores
        #
        cache = L1Cache(size=options.sc_l1_size,
                        assoc=options.sc_l1_assoc,
                        replacement_policy=LRUReplacementPolicy(),
                        start_index_bit=block_size_bits,
                        dataArrayBanks=4,
                        tagArrayBanks=4,
                        dataAccessLatency=4,
                        tagAccessLatency=4,
                        resourceStalls=False)

        l1_cntrl = GPUL1Cache_Controller(
            version=i, cache=cache,
            l2_select_num_bits=l2_bits,
            num_l2=options.num_l2caches,
            transitions_per_cycle=options.ports,
            issue_latency=l1_to_l2_noc_latency,
            number_of_TBEs=options.gpu_l1_buf_depth,
            ruby_system=ruby_system)

        gpu_seq = RubySequencer(
            version=options.num_cpus + i,
            icache=cache, dcache=cache,
            max_outstanding_requests=options.gpu_l1_buf_depth,
            ruby_system=ruby_system,
            deadlock_threshold=2000000,
            connect_to_io=False)

        l1_cntrl.sequencer = gpu_seq

        exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i)

        #
        # Add controllers and sequencers to the appropriate lists
        #
        all_sequencers.append(gpu_seq)
        gpu_cluster.add(l1_cntrl)

        # Connect the controller to the network
        l1_cntrl.requestFromL1Cache = MessageBuffer(ordered=True)
        l1_cntrl.requestFromL1Cache.master = ruby_system.network.slave
        l1_cntrl.responseToL1Cache = MessageBuffer(ordered=True)
        l1_cntrl.responseToL1Cache.slave = ruby_system.network.master
        l1_cntrl.mandatoryQueue = MessageBuffer()

    l2_index_start = block_size_bits + l2_bits

    # Use L2 cache and interconnect latencies to calculate protocol latencies
    # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles
    l2_cache_access_latency = 30  # ~10 GPU cycles
    l2_to_l1_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops
    l2_to_mem_noc_latency = 125  # ~40 GPU cycles

    l2_clusters = []
    for i in xrange(options.num_l2caches):
        #
        # First create the Ruby objects associated with this cpu
        #
        l2_cache = L2Cache(size=options.sc_l2_size,
                           assoc=options.sc_l2_assoc,
                           start_index_bit=l2_index_start,
                           replacement_policy=LRUReplacementPolicy(),
                           dataArrayBanks=4,
                           tagArrayBanks=4,
                           dataAccessLatency=4,
                           tagAccessLatency=4,
                           resourceStalls=options.gpu_l2_resource_stalls)

        l2_cntrl = GPUL2Cache_Controller(
            version=i, L2cache=l2_cache,
            transitions_per_cycle=options.ports,
            l2_response_latency=l2_cache_access_latency +
                l2_to_l1_noc_latency,
            l2_request_latency=l2_to_mem_noc_latency,
            cache_response_latency=l2_cache_access_latency,
            ruby_system=ruby_system)

        exec("ruby_system.l2_cntrl%d = l2_cntrl" % i)
        l2_cluster = Cluster(intBW=32, extBW=32)
        l2_cluster.add(l2_cntrl)
        gpu_cluster.add(l2_cluster)
        l2_clusters.append(l2_cluster)

        # Connect the controller to the network
        l2_cntrl.responseToL1Cache = MessageBuffer(ordered=True)
        l2_cntrl.responseToL1Cache.master = ruby_system.network.slave
        l2_cntrl.requestFromCache = MessageBuffer()
        l2_cntrl.requestFromCache.master = ruby_system.network.slave
        l2_cntrl.responseFromCache = MessageBuffer()
        l2_cntrl.responseFromCache.master = ruby_system.network.slave
        l2_cntrl.unblockFromCache = MessageBuffer()
        l2_cntrl.unblockFromCache.master = ruby_system.network.slave

        l2_cntrl.requestFromL1Cache = MessageBuffer(ordered=True)
        l2_cntrl.requestFromL1Cache.slave = ruby_system.network.master
        l2_cntrl.forwardToCache = MessageBuffer()
        l2_cntrl.forwardToCache.slave = ruby_system.network.master
        l2_cntrl.responseToCache = MessageBuffer()
        l2_cntrl.responseToCache.slave = ruby_system.network.master
        l2_cntrl.triggerQueue = MessageBuffer()

    gpu_phys_mem_size = system.gpu.gpu_memory_range.size()

    # BUGFIX: initialize here, not only inside the branch below. The
    # original defined dev_dir_cntrls only when num_dev_dirs > 0, but it
    # is iterated unconditionally when assembling complete_cluster,
    # raising a NameError when num_dev_dirs == 0.
    dev_dir_cntrls = []

    if options.num_dev_dirs > 0:
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not
                # overlap with the probe filter index
                assert (options.numa_high_bit - dir_bits > pf_bits)
            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrls)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory
            # controller
            #
            dir_version = i + num_cpu_dirs
            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size=pf_size, assoc=4,
                             start_index_bit=pf_start_bit)

            dev_dir_cntrl = Directory_Controller(
                version=dir_version,
                directory=RubyDirectoryMemory(
                    version=dir_version,
                    size=dir_size,
                    numa_high_bit=options.numa_high_bit,
                    device_directory=True),
                probeFilter=pf,
                probe_filter_enabled=options.pf_on,
                full_bit_dir_enabled=options.dir_on,
                transitions_per_cycle=options.ports,
                ruby_system=ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = MessageBuffer()
            dev_dir_cntrl.forwardFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = MessageBuffer()
            dev_dir_cntrl.responseFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaResponseFromDir.master = \
                ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = MessageBuffer()
            dev_dir_cntrl.unblockToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseToDir = MessageBuffer()
            dev_dir_cntrl.responseToDir.slave = ruby_system.network.master
            dev_dir_cntrl.requestToDir = MessageBuffer()
            dev_dir_cntrl.requestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaRequestToDir.slave = ruby_system.network.master

            dev_dir_cntrl.triggerQueue = MessageBuffer(ordered=True)
            dev_dir_cntrl.responseFromMemory = MessageBuffer()

            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                system.gpu.gpu_memory_range, i, options.num_dev_dirs,
                int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

        system.dev_mem_ctrls = dev_mem_ctrls
    else:
        # Since there are no device directories, use CPU directories
        # Fix up the memory sizes of the CPU directories
        num_dirs = len(dir_cntrls)
        add_gpu_mem = gpu_phys_mem_size / num_dirs
        for cntrl in dir_cntrls:
            new_size = cntrl.directory.size.value + add_gpu_mem
            cntrl.directory.size.value = new_size

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size="4096B", assoc=2)

    gpu_ce_seq = RubySequencer(
        version=options.num_cpus + options.num_sc + 1,
        icache=cache, dcache=cache,
        max_outstanding_requests=64,
        support_inst_reqs=False,
        ruby_system=ruby_system,
        connect_to_io=False)

    gpu_ce_cntrl = GPUCopyDMA_Controller(
        version=1, sequencer=gpu_ce_seq,
        number_of_TBEs=256,
        transitions_per_cycle=options.ports,
        ruby_system=ruby_system)

    ruby_system.dev_ce_cntrl = gpu_ce_cntrl

    all_sequencers.append(cpu_ce_seq)
    all_sequencers.append(gpu_ce_seq)

    gpu_ce_cntrl.responseFromDir = MessageBuffer(ordered=True)
    gpu_ce_cntrl.responseFromDir.slave = ruby_system.network.master
    gpu_ce_cntrl.reqToDirectory = MessageBuffer(ordered=True)
    gpu_ce_cntrl.reqToDirectory.master = ruby_system.network.slave
    gpu_ce_cntrl.mandatoryQueue = MessageBuffer()

    # Assemble the final cluster hierarchy.
    complete_cluster = Cluster(intBW=32, extBW=32)
    complete_cluster.add(cpu_ce_cntrl)
    complete_cluster.add(gpu_ce_cntrl)
    complete_cluster.add(cpu_cluster)
    complete_cluster.add(gpu_cluster)

    for cntrl in dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dev_dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dma_cntrls:
        complete_cluster.add(cntrl)

    for cluster in l2_clusters:
        complete_cluster.add(cluster)

    return (all_sequencers, dir_cntrls, complete_cluster)
def create_system(options, full_system, system, dma_devices, ruby_system):
    """Build the combined CPU+GPU Ruby system on top of VI_hammer
    (region-buffer variant, pre-MessageBuffer port API).

    Returns (all_sequencers, dir_cntrls, complete_cluster).
    """
    if not buildEnv['GPGPU_SIM']:
        m5.util.panic("This script requires GPGPU-Sim integration to be built.")

    # Run the protocol script to setup CPU cluster, directory and DMA
    (all_sequencers, dir_cntrls, dma_cntrls, cpu_cluster) = \
        VI_hammer.create_system(options, full_system, system, dma_devices,
                                ruby_system)

    # If we're going to split the directories/memory controllers
    if options.num_dev_dirs > 0:
        cpu_cntrl_count = len(cpu_cluster)
    else:
        cpu_cntrl_count = len(cpu_cluster) + len(dir_cntrls)

    #
    # Create controller for the copy engine to connect to in CPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size = "4096B", assoc = 2)

    cpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc,
                               icache = cache,
                               dcache = cache,
                               max_outstanding_requests = 64,
                               ruby_system = ruby_system,
                               connect_to_io = False)

    cpu_ce_cntrl = GPUCopyDMA_Controller(version = 0,
                                         sequencer = cpu_ce_seq,
                                         number_of_TBEs = 256,
                                         ruby_system = ruby_system)

    cpu_cntrl_count += 1

    cpu_ce_cntrl.responseFromDir = ruby_system.network.master
    cpu_ce_cntrl.reqToDirectory = ruby_system.network.slave

    #
    # Build GPU cluster
    #
    gpu_cluster = Cluster(intBW = 32, extBW = 32)
    gpu_cluster.disableConnectToParent()

    l2_bits = int(math.log(options.num_l2caches, 2))
    block_size_bits = int(math.log(options.cacheline_size, 2))

    # This represents the L1 to L2 interconnect latency
    # NOTE! This latency is in Ruby (cache) cycles, not SM cycles
    per_hop_interconnect_latency = 45 # ~15 GPU cycles
    num_dance_hall_hops = int(math.log(options.num_sc, 2))
    if num_dance_hall_hops == 0:
        num_dance_hall_hops = 1
    l1_to_l2_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops

    #
    # Caches for GPU cores
    #
    for i in xrange(options.num_sc):
        #
        # First create the Ruby objects associated with the GPU cores
        #
        cache = L1Cache(size = options.sc_l1_size,
                        assoc = options.sc_l1_assoc,
                        replacement_policy = "LRU",
                        start_index_bit = block_size_bits,
                        dataArrayBanks = 4,
                        tagArrayBanks = 4,
                        dataAccessLatency = 4,
                        tagAccessLatency = 4,
                        resourceStalls = False)

        l1_cntrl = GPUL1Cache_Controller(version = i,
                                  cache = cache,
                                  l2_select_num_bits = l2_bits,
                                  num_l2 = options.num_l2caches,
                                  issue_latency = l1_to_l2_noc_latency,
                                  number_of_TBEs = options.gpu_l1_buf_depth,
                                  ruby_system = ruby_system)

        gpu_seq = RubySequencer(version = options.num_cpus + i,
                            icache = cache,
                            dcache = cache,
                            max_outstanding_requests = options.gpu_l1_buf_depth,
                            ruby_system = ruby_system,
                            deadlock_threshold = 2000000,
                            connect_to_io = False)

        l1_cntrl.sequencer = gpu_seq

        exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i)

        #
        # Add controllers and sequencers to the appropriate lists
        #
        all_sequencers.append(gpu_seq)
        gpu_cluster.add(l1_cntrl)

        # Connect the controller to the network
        l1_cntrl.requestFromL1Cache = ruby_system.network.slave
        l1_cntrl.responseToL1Cache = ruby_system.network.master

    l2_index_start = block_size_bits + l2_bits

    # Use L2 cache and interconnect latencies to calculate protocol latencies
    # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles
    l2_cache_access_latency = 30 # ~10 GPU cycles
    l2_to_l1_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops
    l2_to_mem_noc_latency = 125 # ~40 GPU cycles

    l2_clusters = []
    for i in xrange(options.num_l2caches):
        #
        # First create the Ruby objects associated with this cpu
        #
        l2_cache = L2Cache(size = options.sc_l2_size,
                           assoc = options.sc_l2_assoc,
                           start_index_bit = l2_index_start,
                           replacement_policy = "LRU",
                           dataArrayBanks = 4,
                           tagArrayBanks = 4,
                           dataAccessLatency = 4,
                           tagAccessLatency = 4,
                           resourceStalls = options.gpu_l2_resource_stalls)

        # BUGFIX: the original used "assoc = 2^16"; in Python '^' is
        # bitwise XOR, so that evaluated to 18, not the intended 65536
        # (an effectively fully-associative 8MB region buffer).
        region_buffer = regionBuffer_Obj(size = "8MB",
                           assoc = 2**16,
                           start_index_bit = l2_index_start,
                           replacement_policy = "LRU",
                           dataArrayBanks = 4,
                           tagArrayBanks = 4,
                           dataAccessLatency = 4,
                           tagAccessLatency = 4,
                           resourceStalls = options.gpu_l2_resource_stalls,
                           regionSize = options.region_size)

        l2_cntrl = GPUL2Cache_Controller(version = i,
                                L2cache = l2_cache,
                                regionBuffer = region_buffer,
                                l2_response_latency = l2_cache_access_latency
                                                      + l2_to_l1_noc_latency,
                                l2_request_latency = l2_to_mem_noc_latency,
                                cache_response_latency =
                                    l2_cache_access_latency,
                                ruby_system = ruby_system)

        exec("ruby_system.l2_cntrl%d = l2_cntrl" % i)
        l2_cluster = Cluster(intBW = 32, extBW = 32)
        l2_cluster.add(l2_cntrl)
        gpu_cluster.add(l2_cluster)
        l2_clusters.append(l2_cluster)

        # Connect the controller to the network
        l2_cntrl.responseToL1Cache = ruby_system.network.slave
        l2_cntrl.requestFromCache = ruby_system.network.slave
        l2_cntrl.responseFromCache = ruby_system.network.slave
        l2_cntrl.unblockFromCache = ruby_system.network.slave

        l2_cntrl.requestFromL1Cache = ruby_system.network.master
        l2_cntrl.forwardToCache = ruby_system.network.master
        l2_cntrl.responseToCache = ruby_system.network.master

    gpu_phys_mem_size = system.gpu.gpu_memory_range.size()

    # BUGFIX: initialize here, not only inside the branch below. The
    # original defined dev_dir_cntrls only when num_dev_dirs > 0, but it
    # is iterated unconditionally when assembling complete_cluster,
    # raising a NameError when num_dev_dirs == 0.
    dev_dir_cntrls = []

    if options.num_dev_dirs > 0:
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not
                # overlap with the probe filter index
                assert(options.numa_high_bit - dir_bits > pf_bits)
            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrls)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory
            # controller
            #
            dir_version = i + num_cpu_dirs
            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size = pf_size, assoc = 4,
                             start_index_bit = pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                     version = dir_version,
                                     size = dir_size,
                                     numa_high_bit = \
                                         options.numa_high_bit,
                                     device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = ruby_system.network.master
            dev_dir_cntrl.responseToDir = ruby_system.network.master
            dev_dir_cntrl.requestToDir = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = ruby_system.network.master

            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                system.gpu.gpu_memory_range, i, options.num_dev_dirs,
                int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

        system.dev_mem_ctrls = dev_mem_ctrls
    else:
        # Since there are no device directories, use CPU directories
        # Fix up the memory sizes of the CPU directories
        num_dirs = len(dir_cntrls)
        add_gpu_mem = gpu_phys_mem_size / num_dirs
        for cntrl in dir_cntrls:
            new_size = cntrl.directory.size.value + add_gpu_mem
            cntrl.directory.size.value = new_size

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size = "4096B", assoc = 2)

    gpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc + 1,
                               icache = cache,
                               dcache = cache,
                               max_outstanding_requests = 64,
                               support_inst_reqs = False,
                               ruby_system = ruby_system,
                               connect_to_io = False)

    gpu_ce_cntrl = GPUCopyDMA_Controller(version = 1,
                                         sequencer = gpu_ce_seq,
                                         number_of_TBEs = 256,
                                         ruby_system = ruby_system)

    ruby_system.l1_cntrl_ce = gpu_ce_cntrl

    all_sequencers.append(cpu_ce_seq)
    all_sequencers.append(gpu_ce_seq)

    gpu_ce_cntrl.responseFromDir = ruby_system.network.master
    gpu_ce_cntrl.reqToDirectory = ruby_system.network.slave

    # Assemble the final cluster hierarchy.
    complete_cluster = Cluster(intBW = 32, extBW = 32)
    complete_cluster.add(cpu_ce_cntrl)
    complete_cluster.add(gpu_ce_cntrl)
    complete_cluster.add(cpu_cluster)
    complete_cluster.add(gpu_cluster)

    for cntrl in dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dev_dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dma_cntrls:
        complete_cluster.add(cntrl)

    for cluster in l2_clusters:
        complete_cluster.add(cluster)

    return (all_sequencers, dir_cntrls, complete_cluster)
def create_system(options, full_system, system, dma_ports, ruby_system):
    # Fusion wrapper: delegates to the matching non-split ('fusion')
    # protocol script, then adds GPU-side device directories and a copy
    # engine L1 controller on top of the system it created.
    if not buildEnv['GPGPU_SIM']:
        m5.util.panic("This script requires GPGPU-Sim integration to be built.")

    options.access_backing_store = True

    # Run the original protocol script
    buildEnv['PROTOCOL'] = buildEnv['PROTOCOL'].replace('split', 'fusion')
    protocol = buildEnv['PROTOCOL']
    # Import the protocol config module by name (gem5 config convention).
    exec "import %s" % protocol
    try:
        (cpu_sequencers, dir_cntrl_nodes, topology) = \
            eval("%s.create_system(options, full_system, system, dma_ports, ruby_system)" % protocol)
    except:
        # Bare except is deliberate: report which protocol failed, then
        # re-raise the original error unchanged.
        print "Error: could not create system for ruby protocol inside fusion system %s" % protocol
        raise

    # Faking things to build the rest of the system
    print "Warning!"
    print "Warning: Faking split MOESI_hammer protocol; collecting checkpoints?"
    print "Warning!"

    if options.num_dev_dirs > 0:
        block_size_bits = int(math.log(options.cacheline_size, 2))
        gpu_phys_mem_size = system.gpu.gpu_memory_range.size()
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not
                # overlap with the probe filter index
                assert(options.numa_high_bit - dir_bits > pf_bits)
            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_dir_cntrls = []
        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrl_nodes)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory
            # controller
            #
            # Device directory versions continue after the CPU ones.
            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size = pf_size, assoc = 4,
                             start_index_bit = pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                     version = dir_version,
                                     size = dir_size,
                                     numa_high_bit = \
                                         options.numa_high_bit,
                                     device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = ruby_system.network.master
            dev_dir_cntrl.responseToDir = ruby_system.network.master
            dev_dir_cntrl.requestToDir = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = ruby_system.network.master

            # One memory controller per device directory, interleaved
            # across the GPU memory range.
            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                system.gpu.gpu_memory_range, i,
                options.num_dev_dirs,
                int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

            topology.addController(dev_dir_cntrl)

        system.dev_mem_ctrls = dev_mem_ctrls

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    block_size_bits = int(math.log(options.cacheline_size, 2))
    l1i_cache = L1Cache(size = "2kB", assoc = 2)
    l1d_cache = L1Cache(size = "2kB", assoc = 2)
    l2_cache = L2Cache(size = "2kB",
                       assoc = 2,
                       start_index_bit = block_size_bits)

    l1_cntrl = L1Cache_Controller(version = options.num_cpus + options.num_sc,
                                  L1Icache = l1i_cache,
                                  L1Dcache = l1d_cache,
                                  L2cache = l2_cache,
                                  no_mig_atomic = not \
                                      options.allow_atomic_migration,
                                  send_evictions = (
                                      options.cpu_type == "detailed"),
                                  ruby_system = ruby_system)

    gpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc,
                               icache = l1i_cache,
                               dcache = l1d_cache,
                               max_outstanding_requests = 64,
                               ruby_system = ruby_system,
                               connect_to_io = False)

    l1_cntrl.sequencer = gpu_ce_seq
    ruby_system.l1_cntrl_gpuce = l1_cntrl

    cpu_sequencers.append(gpu_ce_seq)
    topology.addController(l1_cntrl)

    # Connect the L1 controller and the network
    # Connect the buffers from the controller to network
    l1_cntrl.requestFromCache = ruby_system.network.slave
    l1_cntrl.responseFromCache = ruby_system.network.slave
    l1_cntrl.unblockFromCache = ruby_system.network.slave

    # Connect the buffers from the network to the controller
    l1_cntrl.forwardToCache = ruby_system.network.master
    l1_cntrl.responseToCache = ruby_system.network.master

    return (cpu_sequencers, dir_cntrl_nodes, topology)
def create_system(options, full_system, system, dma_ports, ruby_system):
    """
    Build the Ruby memory system for a fused CPU/GPU ("fusion") protocol.

    Delegates CPU-side setup to the 'fusion' variant of the currently
    configured Ruby protocol, then (when options.num_dev_dirs > 0) adds GPU
    device directory controllers, each backed by its own memory controller
    over the GPU memory range, and finally creates a small L1 controller +
    sequencer pair for the GPU copy engine.

    Parameters as used here:
      options     -- parsed command-line options; this function reads
                     num_dev_dirs, mem_type, cacheline_size, sc_l2_size,
                     pf_on, dir_on, ports, numa_high_bit, recycle_latency,
                     num_cpus, num_sc, allow_atomic_migration, and sets
                     access_backing_store.
      full_system -- forwarded unchanged to the underlying protocol script.
      system      -- top-level System; must provide system.gpu with a
                     gpu_memory_range.
      dma_ports   -- forwarded unchanged to the underlying protocol script.
      ruby_system -- RubySystem the new controllers are attached to (as
                     dev_dir_cntrl<i> children and dev_ce_cntrl).

    Returns the (cpu_sequencers, dir_cntrl_nodes, topology) tuple produced
    by the wrapped protocol script, with the copy-engine sequencer appended
    to cpu_sequencers and the device directory controllers added to the
    topology.
    """
    if not buildEnv['GPGPU_SIM']:
        m5.util.panic("This script requires GPGPU-Sim integration to be built.")

    # Ruby is forced to use a backing store so functional accesses work
    # alongside the GPU model.
    options.access_backing_store = True

    # Run the original protocol script: rewrite the configured protocol name
    # from its 'split' form to the 'fusion' form, import that module
    # dynamically, and invoke its create_system() with our own arguments.
    buildEnv['PROTOCOL'] = buildEnv['PROTOCOL'].replace('split', 'fusion')
    protocol = buildEnv['PROTOCOL']
    exec "import %s" % protocol
    try:
        (cpu_sequencers, dir_cntrl_nodes, topology) = \
            eval("%s.create_system(options, full_system, system, dma_ports, ruby_system)" % protocol)
    except:
        # Report which protocol failed, then re-raise the original error.
        print "Error: could not create system for ruby protocol inside fusion system %s" % protocol
        raise

    # Faking things to build the rest of the system
    print "Warning!"
    print "Warning: Faking split MOESI_hammer protocol; collecting checkpoints?"
    print "Warning!"

    if options.num_dev_dirs > 0:
        block_size_bits = int(math.log(options.cacheline_size, 2))
        gpu_phys_mem_size = system.gpu.gpu_memory_range.size()
        # Python 2 integer division: assumes the GPU memory range divides
        # evenly among the device directories -- TODO confirm.
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        # (sc_l2_size is presumably the GPU shader-core L2 size -- verify
        # against the option definition.)
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not overlap
                # with the probe filter index
                assert (options.numa_high_bit - dir_bits > pf_bits)

            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            # No explicit NUMA bit: skip past the directory-select bits (if
            # any) as well as the block offset.
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_dir_cntrls = []
        dev_mem_ctrls = []
        # Device directory versions are numbered after the CPU directories
        # created by the wrapped protocol script.
        num_cpu_dirs = len(dir_cntrl_nodes)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory controller
            #
            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size=pf_size, assoc=4,
                             start_index_bit=pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                            version = dir_version,
                                            size = dir_size,
                                            numa_high_bit = \
                                            options.numa_high_bit,
                                            device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 transitions_per_cycle = options.ports,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            # Attach the controller to ruby_system under a unique child name
            # (dev_dir_cntrl0, dev_dir_cntrl1, ...) so gem5's configuration
            # tree owns it.
            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network.
            # Controller-to-network buffers: buffer.master -> network.slave.
            dev_dir_cntrl.forwardFromDir = MessageBuffer()
            dev_dir_cntrl.forwardFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = MessageBuffer()
            dev_dir_cntrl.responseFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaResponseFromDir.master = ruby_system.network.slave

            # Internal (not network-connected) trigger queue.
            dev_dir_cntrl.triggerQueue = MessageBuffer(ordered=True)

            # Network-to-controller buffers: buffer.slave <- network.master.
            dev_dir_cntrl.unblockToDir = MessageBuffer()
            dev_dir_cntrl.unblockToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseToDir = MessageBuffer()
            dev_dir_cntrl.responseToDir.slave = ruby_system.network.master
            dev_dir_cntrl.requestToDir = MessageBuffer()
            dev_dir_cntrl.requestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaRequestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseFromMemory = MessageBuffer()

            # One memory controller per device directory, interleaved over
            # the GPU memory range (channel i of num_dev_dirs).
            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), system.gpu.gpu_memory_range,
                i, options.num_dev_dirs, int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

            topology.addController(dev_dir_cntrl)

        system.dev_mem_ctrls = dev_mem_ctrls

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    # (The 2kB caches below exist only because the controller requires cache
    # parameters; the copy engine does not rely on their capacity.)
    block_size_bits = int(math.log(options.cacheline_size, 2))
    l1i_cache = L1Cache(size="2kB", assoc=2)
    l1d_cache = L1Cache(size="2kB", assoc=2)
    l2_cache = L2Cache(size="2kB", assoc=2,
                       start_index_bit=block_size_bits)

    # Version is placed after all CPU and shader-core sequencers.
    l1_cntrl = L1Cache_Controller(version = options.num_cpus + options.num_sc,
                                  L1Icache = l1i_cache,
                                  L1Dcache = l1d_cache,
                                  L2cache = l2_cache,
                                  no_mig_atomic = not \
                                      options.allow_atomic_migration,
                                  send_evictions = False,
                                  transitions_per_cycle = options.ports,
                                  ruby_system = ruby_system)

    gpu_ce_seq = RubySequencer(version=options.num_cpus + options.num_sc,
                               icache=l1i_cache,
                               dcache=l1d_cache,
                               max_outstanding_requests=64,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    l1_cntrl.sequencer = gpu_ce_seq
    ruby_system.dev_ce_cntrl = l1_cntrl

    # The copy-engine sequencer is returned to the caller alongside the CPU
    # sequencers so it gets a port like any other.
    cpu_sequencers.append(gpu_ce_seq)
    topology.addController(l1_cntrl)

    # Connect the L1 controller and the network

    # Connect the buffers from the controller to network
    l1_cntrl.requestFromCache = MessageBuffer()
    l1_cntrl.requestFromCache.master = ruby_system.network.slave
    l1_cntrl.responseFromCache = MessageBuffer()
    l1_cntrl.responseFromCache.master = ruby_system.network.slave
    l1_cntrl.unblockFromCache = MessageBuffer()
    l1_cntrl.unblockFromCache.master = ruby_system.network.slave

    # Internal (not network-connected) trigger queue.
    l1_cntrl.triggerQueue = MessageBuffer()

    # Connect the buffers from the network to the controller
    l1_cntrl.mandatoryQueue = MessageBuffer()
    l1_cntrl.forwardToCache = MessageBuffer()
    l1_cntrl.forwardToCache.slave = ruby_system.network.master
    l1_cntrl.responseToCache = MessageBuffer()
    l1_cntrl.responseToCache.slave = ruby_system.network.master

    return (cpu_sequencers, dir_cntrl_nodes, topology)