Beispiel #1
0
def config_hybrid_mem(options, system):
    """
    Assign proper address ranges for DRAM and NVM controllers.
    Create memory controllers and add their shared bus to the system.

    The NVM controller (if any) covers the bottom of the physical
    address space and the DRAM controller covers the top dram_size
    bytes, so together they span options.mem_size.
    """
    system.thnvm_bus = VirtualXBar()
    mem_ctrls = []

    # The default behaviour is to interleave memory channels on 128
    # byte granularity, or cache line granularity if larger than 128
    # byte. This value is based on the locality seen across a large
    # range of workloads.
    intlv_size = max(128, system.cache_line_size.value)

    total_size = Addr(options.mem_size)
    # DRAM capacity is derived from the page size (2**page_bits) and
    # the number of page-translation-table entries.
    dram_size = pow(2, options.page_bits) * options.ptt_length

    if dram_size < total_size.value:
        nvm_cls = MemConfig.get(options.nvm_type)
        nvm_range = AddrRange(0, total_size - dram_size - 1)
        nvm_ctrl = MemConfig.create_mem_ctrl(nvm_cls, nvm_range, 0, 1, 0,
                                             intlv_size)
        # Set the number of ranks based on the command-line
        # options if it was explicitly set
        if issubclass(nvm_cls, DRAMCtrl) and options.mem_ranks:
            nvm_ctrl.ranks_per_channel = options.mem_ranks

        mem_ctrls.append(nvm_ctrl)

    if dram_size > 0:
        dram_cls = MemConfig.get(options.dram_type)
        dram_range = AddrRange(total_size - dram_size, total_size - 1)
        dram_ctrl = MemConfig.create_mem_ctrl(dram_cls, dram_range, 0, 1, 0,
                                              intlv_size)
        # Set the number of ranks based on the command-line
        # options if it was explicitly set
        if issubclass(dram_cls, DRAMCtrl) and options.mem_ranks:
            dram_ctrl.ranks_per_channel = options.mem_ranks

        mem_ctrls.append(dram_ctrl)

    system.mem_ctrls = mem_ctrls

    # Connect the controllers to the THNVM bus.  Iterate directly over
    # the controllers instead of indexing with xrange(len(...)) -- same
    # behaviour, idiomatic, and no Python-2-only builtin.
    for ctrl in system.mem_ctrls:
        ctrl.port = system.thnvm_bus.master

    system.thnvm_bus.slave = system.membus.master
Beispiel #2
0
def config_hybrid_mem(options, system):
    """
    Assign proper address ranges for DRAM and NVM controllers.
    Create memory controllers and add their shared bus to the system.
    """
    system.thnvm_bus = VirtualXBar()
    mem_ctrls = []

    # The default behaviour is to interleave memory channels on 128
    # byte granularity, or cache line granularity if larger than 128
    # byte. This value is based on the locality seen across a large
    # range of workloads.
    intlv_size = max(128, system.cache_line_size.value)

    total_size = Addr(options.mem_size)
    # DRAM capacity follows from the page size (2**page_bits) times the
    # number of page-translation-table entries.
    dram_size = pow(2, options.page_bits) * options.ptt_length

    # The NVM controller covers the bottom of the address space, unless
    # DRAM already spans the whole memory.
    if dram_size < total_size.value:
        nvm_cls = MemConfig.get(options.nvm_type)
        nvm_range = AddrRange(0, total_size - dram_size - 1)
        nvm_ctrl = MemConfig.create_mem_ctrl(nvm_cls, nvm_range,
                                             0, 1, 0, intlv_size)
        # Set the number of ranks based on the command-line
        # options if it was explicitly set
        if issubclass(nvm_cls, DRAMCtrl) and options.mem_ranks:
            nvm_ctrl.ranks_per_channel = options.mem_ranks

        mem_ctrls.append(nvm_ctrl)

    # The DRAM controller covers the top dram_size bytes.
    if dram_size > 0:
        dram_cls = MemConfig.get(options.dram_type)
        dram_range = AddrRange(total_size - dram_size, total_size - 1)
        dram_ctrl = MemConfig.create_mem_ctrl(dram_cls, dram_range,
                                              0, 1, 0, intlv_size)
        # Set the number of ranks based on the command-line
        # options if it was explicitly set
        if issubclass(dram_cls, DRAMCtrl) and options.mem_ranks:
            dram_ctrl.ranks_per_channel = options.mem_ranks

        mem_ctrls.append(dram_ctrl)

    system.mem_ctrls = mem_ctrls

    # Connect the controllers to the THNVM bus
    for i in xrange(len(system.mem_ctrls)):
        system.mem_ctrls[i].port = system.thnvm_bus.master

    system.thnvm_bus.slave = system.membus.master
Beispiel #3
0
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """
    Create one memory controller per (directory, address range) pair
    and wire it up.  On non-ARM targets a per-directory IOXBar is
    inserted when the system has more than one memory range; on ARM
    the directories and controllers connect through the IO bus.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48
    block_size_bits = int(math.log(options.cacheline_size, 2))

    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # Without an explicit numa bit, place the directory-select bits
        # directly above the block offset and use the topmost of those
        # bits as the numa bit.
        dir_bits = int(math.log(options.num_dirs, 2))
        numa_bit = block_size_bits + dir_bits - 1

    mem_ctrls = []
    crossbars = []

    # A separate controller is needed per address range, since the
    # abstract memory can only cover one contiguous range.
    for index, dir_cntrl in enumerate(dir_cntrls):
        dir_cntrl.directory.numa_high_bit = numa_bit

        xbar = None
        if buildEnv['TARGET_ISA'] == "arm":
            # ARM: the directory talks to memory through the IO bus.
            dir_cntrl.memory = system.iobus.slave
        elif len(system.mem_ranges) > 1:
            xbar = IOXBar()
            crossbars.append(xbar)
            dir_cntrl.memory = xbar.slave

        for mem_range in system.mem_ranges:
            ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), mem_range, index,
                options.num_dirs, int(math.log(options.num_dirs, 2)),
                options.cacheline_size)
            mem_ctrls.append(ctrl)

            if buildEnv['TARGET_ISA'] == "arm":
                # ARM: controllers also hang off the IO bus.
                ctrl.port = system.iobus.master
            elif xbar is not None:
                ctrl.port = xbar.master
            else:
                ctrl.port = dir_cntrl.memory

    system.mem_ctrls = mem_ctrls

    if buildEnv['TARGET_ISA'] != "arm" and crossbars:
        ruby.crossbars = crossbars
Beispiel #4
0
def jiwon_config_pim(test_sys, options):
    """
    Build the PIM subsystem: one vault memory controller and one PIM
    core system per --num-pim-sys, plus the p2s/s2p bridges and
    (optionally) communication monitors that connect them to the host.
    """
    (TestCPUClass, test_mem_mode, FutureClass) = \
        Simulation.setCPUClass(options)

    # Create one vault controller per PIM system; each vault owns a
    # private DRAM-channel-sized slice starting at PIM_VAULT_BASE_ADDR.
    cls = MemConfig.get(options.pim_mem_type)
    pim_vault_ctrls = []
    for i in xrange(options.num_pim_sys):
        vault_range = AddrRange(
            PIM_VAULT_BASE_ADDR + i * DRAM_CHANNEL_SIZE_INT,
            size=DRAM_CHANNEL_SIZE_MB)
        # i=0 / nbr_mem_ctrls=1: each vault is a stand-alone
        # single-channel memory, so there is no interleaving.
        pim_vault = ethz_create_mem_ctrl(
            cls, vault_range, i=0, nbr_mem_ctrls=1, intlv_bits=0,
            cache_line_size=test_sys.cache_line_size.value)
        pim_vault_ctrls.append(pim_vault)

    test_sys.pim_vault_ctrls = pim_vault_ctrls

    # create PIM cores
    test_sys.pim_sys = [build_pim_system(options, test_mem_mode,
                                         TestCPUClass, pim_id=i)
                        for i in xrange(options.num_pim_sys)]
    for i in xrange(options.num_pim_sys):
        # Hoist the repeated test_sys.pim_sys[i] chain to a local alias.
        pim = test_sys.pim_sys[i]
        # physical address ranges of the PIM device and of its vault
        pim_device_range = AddrRange(
            PIM_ADDRESS_BASE + i * PIM_ADDRESS_SIZE_INT,
            size=PIM_ADDRESS_SIZE)
        pim_vault_phys_addr_base = \
            PIM_VAULT_BASE_ADDR + i * DRAM_CHANNEL_SIZE_INT
        pim_vault_range = AddrRange(pim_vault_phys_addr_base,
                                    size=DRAM_CHANNEL_SIZE_MB)
        pim.p2s = Bridge(ranges=pim_vault_range, delay='0.01ns',
                         req_size=32, resp_size=32)
        pim.s2p = Bridge(ranges=pim_device_range, delay='0.01ns',
                         req_size=32, resp_size=32)

        # Map the vault's virtual window onto its physical slice in the
        # instruction, data and special TLBs (same mapping for each).
        pim_vault_virt_addr_base = 0xC0000000  # JIWON: TODO-- need to change this to something more reasonable....
        for tlb in (pim.itlb, pim.dtlb, pim.stlb):
            tlb.original_ranges.append(
                AddrRange(pim_vault_virt_addr_base,
                          size=DRAM_CHANNEL_SIZE_MB))
            tlb.remapped_ranges.append(
                AddrRange(pim_vault_phys_addr_base,
                          size=DRAM_CHANNEL_SIZE_MB))

        if GEM5_ENABLE_COMM_MONITORS == "TRUE":
            # Insert a monitor between the PIM bus and the p2s bridge.
            pim.Smon = CommMonitor()

            if SMON_DUMP_ADDRESS == "TRUE":
                pim.Smon.dump_addresses = True
                pim.Smon.dump_file = "m5out/smon_addr_dump.txt"

            pim.pimbus.master = pim.Smon.slave
            pim.Smon.master = pim.p2s.slave
        else:
            pim.pimbus.master = pim.p2s.slave

        pim.s2p.master = pim.pimbus.slave
        if MOVE_PIM_TO_HOST == "FALSE":
            # connect PIM core to system
            test_sys.smcxbar.master = pim.s2p.slave
            # connect PIM vault to PIM core
            test_sys.pim_vault_ctrls[i].port = pim.p2s.master
        else:
            pim.p2s.master = test_sys.membus.slave
            test_sys.membus.master = pim.s2p.slave
Beispiel #5
0
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """
    Create memory controllers for every (directory, address range)
    pair and attach them, inserting a noncoherent crossbar per
    directory when the system exposes more than one memory range.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48
    block_size_bits = int(math.log(options.cacheline_size, 2))

    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # Default: the directory-select bits sit just above the block
        # offset, and the numa bit is the highest of them.
        dir_bits = int(math.log(options.num_dirs, 2))
        numa_bit = block_size_bits + dir_bits - 1

    mem_ctrls = []
    crossbars = []

    # One controller per address range, since the abstract memory can
    # only handle a single contiguous range.
    for index, dir_cntrl in enumerate(dir_cntrls):
        dir_cntrl.directory.numa_high_bit = numa_bit

        xbar = None
        if len(system.mem_ranges) > 1:
            xbar = NoncoherentXBar()
            crossbars.append(xbar)
            dir_cntrl.memory = xbar.slave

        for mem_range in system.mem_ranges:
            ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), mem_range, index,
                options.num_dirs, int(math.log(options.num_dirs, 2)),
                options.cacheline_size)
            mem_ctrls.append(ctrl)
            # Route through the crossbar when one exists, otherwise
            # connect straight to the directory's memory port.
            ctrl.port = xbar.master if xbar is not None else dir_cntrl.memory

    system.mem_ctrls = mem_ctrls

    if crossbars:
        ruby.crossbars = crossbars
Beispiel #6
0
def ethz_config_mem(options, system):
    """
    Create the memory controllers based on the options and attach them.

    If requested, we make a multi-channel configuration of the
    selected memory controller class by creating multiple instances of
    the specific class. The individual controllers have their
    parameters set such that the address range is interleaved between
    them.
    """
    from m5.util import fatal
    nbr_mem_ctrls = options.mem_channels
    # A power-of-two channel count is required for address
    # interleaving.  Use bit arithmetic rather than math.log, whose
    # float result can round incorrectly for large powers of two.
    if nbr_mem_ctrls <= 0 or nbr_mem_ctrls & (nbr_mem_ctrls - 1):
        fatal("Number of memory channels must be a power of 2")
    intlv_bits = nbr_mem_ctrls.bit_length() - 1

    cls = MemConfig.get(options.mem_type)
    mem_ctrls = []

    # For every range (most systems will only have one), create an
    # array of controllers and set their parameters to match their
    # address mapping in the case of a DRAM
    for r in system.mem_ranges:
        for i in xrange(nbr_mem_ctrls):
            mem_ctrls.append(
                ethz_create_mem_ctrl(cls, r, i, nbr_mem_ctrls, intlv_bits,
                                     system.cache_line_size.value))

    system.mem_ctrls = mem_ctrls

    # Connect the controllers to the smcxbar
    for ctrl in system.mem_ctrls:
        ctrl.port = system.smcxbar.master  # Modified by Erfan

    ethz_print_val("system.cache_line_size.value (bytes)",
                   system.cache_line_size.value)

    if options.mem_type != "ethz_ModelsimIF":
        ethz_print_val("number of vaults", nbr_mem_ctrls)
Beispiel #7
0
def setMemClass(options):
    """Look up and return the memory controller class selected by
    ``options.mem_type``."""
    mem_cls = MemConfig.get(options.mem_type)
    return mem_cls
Beispiel #8
0
def setMemClass(options):
    """Returns a memory controller class."""

    # Resolve the --mem-type option string to its controller class via
    # the MemConfig registry.
    return MemConfig.get(options.mem_type)
Beispiel #9
0
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """
    Create memory (and optional DRAM-cache) controllers for each
    directory controller and wire them up.  Only a single system
    memory range is supported; more than one is a fatal error.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48
    block_size_bits = int(math.log(options.cacheline_size, 2))

    if options.numa_high_bit:
        numa_bit = options.numa_high_bit
    else:
        # if the numa_bit is not specified, set the directory bits as the
        # lowest bits above the block offset bits, and the numa_bit as the
        # highest of those directory bits
        dir_bits = int(math.log(options.num_dirs, 2))
        numa_bit = block_size_bits + dir_bits - 1

    index = 0
    mem_ctrls = []
    crossbars = []

    # Sets bits to be used for interleaving.  Creates memory controllers
    # attached to a directory controller.  A separate controller is created
    # for each address range as the abstract memory can handle only one
    # contiguous address range as of now.
    for dir_cntrl in dir_cntrls:
        # Create 1 instance of DRAMCache per directory controller
        if options.dramcache:
            dramcache_ctrl = MemConfig.create_dramcache_ctrl(
                MemConfig.get_cache(options.dramcache_type),
                system.mem_ranges[0], index, options.num_dirs,
                options.dramcache_size, options.dramcache_assoc,
                options.dramcache_block_size, options.num_cpus,
                options.dramcache_timing)

            mem_ctrls.append(dramcache_ctrl)
            dir_cntrl.memory = dramcache_ctrl.port

        dir_cntrl.directory.numa_high_bit = numa_bit

        crossbar = None
        if len(system.mem_ranges) > 1:
            # Multiple ranges are unsupported: fatal() aborts here, so
            # no crossbar is ever built.  (The IOXBar construction that
            # used to follow this call was unreachable and was removed.)
            fatal("system mem_ranges greater than 1")

        for r in system.mem_ranges:
            # if dramcache exists interleave at dramcache_block_size
            if options.dramcache:
                mem_ctrl = MemConfig.create_mem_ctrl(
                    MemConfig.get(options.mem_type),
                    r, index, options.num_dirs,
                    int(math.log(options.num_dirs,
                                 2)), options.dramcache_block_size)
            else:
                mem_ctrl = MemConfig.create_mem_ctrl(
                    MemConfig.get(options.mem_type),
                    r, index, options.num_dirs,
                    int(math.log(options.num_dirs, 2)), options.cacheline_size)

            mem_ctrls.append(mem_ctrl)

            if crossbar != None:
                mem_ctrl.port = crossbar.master
            else:
                if options.dramcache:
                    # Memory sits behind the DRAM cache.
                    mem_ctrl.port = dramcache_ctrl.dramcache_masterport
                else:
                    mem_ctrl.port = dir_cntrl.memory

        index += 1

    system.mem_ctrls = mem_ctrls

    if len(crossbars) > 0:
        ruby.crossbars = crossbars
Beispiel #10
0
def create_system(options, full_system, system, dma_devices, ruby_system):
    """
    Build the combined CPU+GPU Ruby system on top of VI_hammer.

    Creates copy-engine controllers for the CPU and GPU clusters, the
    GPU L1/L2 cache cluster, and (when --num-dev-dirs > 0) device
    directories with their own memory controllers, then assembles
    everything into one cluster.

    Returns (all_sequencers, dir_cntrls, complete_cluster).
    """

    if not buildEnv['GPGPU_SIM']:
        m5.util.panic(
            "This script requires GPGPU-Sim integration to be built.")

    # Run the protocol script to setup CPU cluster, directory and DMA
    (all_sequencers, dir_cntrls, dma_cntrls, cpu_cluster) = \
                                        VI_hammer.create_system(options,
                                                                full_system,
                                                                system,
                                                                dma_devices,
                                                                ruby_system)

    # If we're going to split the directories/memory controllers
    if options.num_dev_dirs > 0:
        cpu_cntrl_count = len(cpu_cluster)
    else:
        cpu_cntrl_count = len(cpu_cluster) + len(dir_cntrls)

    #
    # Create controller for the copy engine to connect to in CPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size="4096B", assoc=2)

    cpu_ce_seq = RubySequencer(version=options.num_cpus + options.num_sc,
                               icache=cache,
                               dcache=cache,
                               max_outstanding_requests=64,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    cpu_ce_cntrl = GPUCopyDMA_Controller(version=0,
                                         sequencer=cpu_ce_seq,
                                         number_of_TBEs=256,
                                         transitions_per_cycle=options.ports,
                                         ruby_system=ruby_system)

    cpu_ce_cntrl.responseFromDir = MessageBuffer(ordered=True)
    cpu_ce_cntrl.responseFromDir.slave = ruby_system.network.master
    cpu_ce_cntrl.reqToDirectory = MessageBuffer(ordered=True)
    cpu_ce_cntrl.reqToDirectory.master = ruby_system.network.slave

    cpu_ce_cntrl.mandatoryQueue = MessageBuffer()

    ruby_system.ce_cntrl = cpu_ce_cntrl

    cpu_cntrl_count += 1

    #
    # Build GPU cluster
    #
    gpu_cluster = Cluster(intBW=32, extBW=32)
    gpu_cluster.disableConnectToParent()

    l2_bits = int(math.log(options.num_l2caches, 2))
    block_size_bits = int(math.log(options.cacheline_size, 2))
    # This represents the L1 to L2 interconnect latency
    # NOTE! This latency is in Ruby (cache) cycles, not SM cycles
    per_hop_interconnect_latency = 45  # ~15 GPU cycles
    num_dance_hall_hops = int(math.log(options.num_sc, 2))
    if num_dance_hall_hops == 0:
        num_dance_hall_hops = 1
    l1_to_l2_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops

    #
    # Caches for GPU cores
    #
    for i in xrange(options.num_sc):
        #
        # First create the Ruby objects associated with the GPU cores
        #
        cache = L1Cache(size=options.sc_l1_size,
                        assoc=options.sc_l1_assoc,
                        replacement_policy=LRUReplacementPolicy(),
                        start_index_bit=block_size_bits,
                        dataArrayBanks=4,
                        tagArrayBanks=4,
                        dataAccessLatency=4,
                        tagAccessLatency=4,
                        resourceStalls=False)

        l1_cntrl = GPUL1Cache_Controller(
            version=i,
            cache=cache,
            l2_select_num_bits=l2_bits,
            num_l2=options.num_l2caches,
            transitions_per_cycle=options.ports,
            issue_latency=l1_to_l2_noc_latency,
            number_of_TBEs=options.gpu_l1_buf_depth,
            ruby_system=ruby_system)

        gpu_seq = RubySequencer(
            version=options.num_cpus + i,
            icache=cache,
            dcache=cache,
            max_outstanding_requests=options.gpu_l1_buf_depth,
            ruby_system=ruby_system,
            deadlock_threshold=2000000,
            connect_to_io=False)

        l1_cntrl.sequencer = gpu_seq

        exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i)

        #
        # Add controllers and sequencers to the appropriate lists
        #
        all_sequencers.append(gpu_seq)
        gpu_cluster.add(l1_cntrl)

        # Connect the controller to the network
        l1_cntrl.requestFromL1Cache = MessageBuffer(ordered=True)
        l1_cntrl.requestFromL1Cache.master = ruby_system.network.slave
        l1_cntrl.responseToL1Cache = MessageBuffer(ordered=True)
        l1_cntrl.responseToL1Cache.slave = ruby_system.network.master

        l1_cntrl.mandatoryQueue = MessageBuffer()

    l2_index_start = block_size_bits + l2_bits
    # Use L2 cache and interconnect latencies to calculate protocol latencies
    # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles
    l2_cache_access_latency = 30  # ~10 GPU cycles
    l2_to_l1_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops
    l2_to_mem_noc_latency = 125  # ~40 GPU cycles

    l2_clusters = []
    for i in xrange(options.num_l2caches):
        #
        # First create the Ruby objects associated with this cpu
        #
        l2_cache = L2Cache(size=options.sc_l2_size,
                           assoc=options.sc_l2_assoc,
                           start_index_bit=l2_index_start,
                           replacement_policy=LRUReplacementPolicy(),
                           dataArrayBanks=4,
                           tagArrayBanks=4,
                           dataAccessLatency=4,
                           tagAccessLatency=4,
                           resourceStalls=options.gpu_l2_resource_stalls)

        l2_cntrl = GPUL2Cache_Controller(
            version=i,
            L2cache=l2_cache,
            transitions_per_cycle=options.ports,
            l2_response_latency=l2_cache_access_latency + l2_to_l1_noc_latency,
            l2_request_latency=l2_to_mem_noc_latency,
            cache_response_latency=l2_cache_access_latency,
            ruby_system=ruby_system)

        exec("ruby_system.l2_cntrl%d = l2_cntrl" % i)
        l2_cluster = Cluster(intBW=32, extBW=32)
        l2_cluster.add(l2_cntrl)
        gpu_cluster.add(l2_cluster)
        l2_clusters.append(l2_cluster)

        # Connect the controller to the network
        l2_cntrl.responseToL1Cache = MessageBuffer(ordered=True)
        l2_cntrl.responseToL1Cache.master = ruby_system.network.slave
        l2_cntrl.requestFromCache = MessageBuffer()
        l2_cntrl.requestFromCache.master = ruby_system.network.slave
        l2_cntrl.responseFromCache = MessageBuffer()
        l2_cntrl.responseFromCache.master = ruby_system.network.slave
        l2_cntrl.unblockFromCache = MessageBuffer()
        l2_cntrl.unblockFromCache.master = ruby_system.network.slave

        l2_cntrl.requestFromL1Cache = MessageBuffer(ordered=True)
        l2_cntrl.requestFromL1Cache.slave = ruby_system.network.master
        l2_cntrl.forwardToCache = MessageBuffer()
        l2_cntrl.forwardToCache.slave = ruby_system.network.master
        l2_cntrl.responseToCache = MessageBuffer()
        l2_cntrl.responseToCache.slave = ruby_system.network.master

        l2_cntrl.triggerQueue = MessageBuffer()

    gpu_phys_mem_size = system.gpu.gpu_memory_range.size()

    if options.num_dev_dirs > 0:
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not overlap
                # with the probe filter index
                assert (options.numa_high_bit - dir_bits > pf_bits)

            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_dir_cntrls = []
        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrls)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory controller
            #

            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size=pf_size,
                             assoc=4,
                             start_index_bit=pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                            version = dir_version,
                                            size = dir_size,
                                            numa_high_bit = \
                                            options.numa_high_bit,
                                            device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 transitions_per_cycle = options.ports,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = MessageBuffer()
            dev_dir_cntrl.forwardFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = MessageBuffer()
            dev_dir_cntrl.responseFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaResponseFromDir.master = ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = MessageBuffer()
            dev_dir_cntrl.unblockToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseToDir = MessageBuffer()
            dev_dir_cntrl.responseToDir.slave = ruby_system.network.master
            dev_dir_cntrl.requestToDir = MessageBuffer()
            dev_dir_cntrl.requestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaRequestToDir.slave = ruby_system.network.master

            dev_dir_cntrl.triggerQueue = MessageBuffer(ordered=True)
            dev_dir_cntrl.responseFromMemory = MessageBuffer()

            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                system.gpu.gpu_memory_range, i, options.num_dev_dirs,
                int(math.log(options.num_dev_dirs, 2)), options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

        system.dev_mem_ctrls = dev_mem_ctrls
    else:
        # No device directories were created, so make sure the wiring
        # loop below sees an empty list instead of raising a NameError.
        dev_dir_cntrls = []

        # Since there are no device directories, use CPU directories
        # Fix up the memory sizes of the CPU directories
        num_dirs = len(dir_cntrls)
        add_gpu_mem = gpu_phys_mem_size / num_dirs
        for cntrl in dir_cntrls:
            new_size = cntrl.directory.size.value + add_gpu_mem
            cntrl.directory.size.value = new_size

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size="4096B", assoc=2)

    gpu_ce_seq = RubySequencer(version=options.num_cpus + options.num_sc + 1,
                               icache=cache,
                               dcache=cache,
                               max_outstanding_requests=64,
                               support_inst_reqs=False,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    gpu_ce_cntrl = GPUCopyDMA_Controller(version=1,
                                         sequencer=gpu_ce_seq,
                                         number_of_TBEs=256,
                                         transitions_per_cycle=options.ports,
                                         ruby_system=ruby_system)

    ruby_system.dev_ce_cntrl = gpu_ce_cntrl

    all_sequencers.append(cpu_ce_seq)
    all_sequencers.append(gpu_ce_seq)

    gpu_ce_cntrl.responseFromDir = MessageBuffer(ordered=True)
    gpu_ce_cntrl.responseFromDir.slave = ruby_system.network.master
    gpu_ce_cntrl.reqToDirectory = MessageBuffer(ordered=True)
    gpu_ce_cntrl.reqToDirectory.master = ruby_system.network.slave

    gpu_ce_cntrl.mandatoryQueue = MessageBuffer()

    complete_cluster = Cluster(intBW=32, extBW=32)
    complete_cluster.add(cpu_ce_cntrl)
    complete_cluster.add(gpu_ce_cntrl)
    complete_cluster.add(cpu_cluster)
    complete_cluster.add(gpu_cluster)

    for cntrl in dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dev_dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dma_cntrls:
        complete_cluster.add(cntrl)

    for cluster in l2_clusters:
        complete_cluster.add(cluster)

    return (all_sequencers, dir_cntrls, complete_cluster)
def create_system(options, full_system, system, dma_devices, ruby_system):
    """
    Build the Ruby memory hierarchy for a fused CPU+GPU system on top of
    the VI_hammer protocol.

    The CPU clusters, CPU directories and DMA controllers come from
    VI_hammer.create_system(). This function then adds:
      * one GPU L1 cache controller + sequencer per shader core,
      * the GPU L2 cache controllers (each with a region buffer),
      * either dedicated GPU device directories with their own memory
        controllers (options.num_dev_dirs > 0) or extra capacity folded
        into the existing CPU directories,
      * a copy-engine controller/sequencer on the CPU side and another
        on the GPU side.

    Returns a tuple of (all_sequencers, dir_cntrls, complete_cluster).
    """

    if not buildEnv['GPGPU_SIM']:
        m5.util.panic("This script requires GPGPU-Sim integration to be built.")

    # Run the protocol script to setup CPU cluster, directory and DMA
    (all_sequencers, dir_cntrls, dma_cntrls, cpu_cluster) = \
                                        VI_hammer.create_system(options,
                                                                full_system,
                                                                system,
                                                                dma_devices,
                                                                ruby_system)

    # If we're going to split the directories/memory controllers
    if options.num_dev_dirs > 0:
        cpu_cntrl_count = len(cpu_cluster)
    else:
        cpu_cntrl_count = len(cpu_cluster) + len(dir_cntrls)

    #
    # Create controller for the copy engine to connect to in CPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size = "4096B", assoc = 2)

    cpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc,
                               icache = cache,
                               dcache = cache,
                               max_outstanding_requests = 64,
                               ruby_system = ruby_system,
                               connect_to_io = False)

    cpu_ce_cntrl = GPUCopyDMA_Controller(version = 0,
                                         sequencer = cpu_ce_seq,
                                         number_of_TBEs = 256,
                                         ruby_system = ruby_system)

    # NOTE(review): cpu_cntrl_count is never read after this point; it is
    # kept only for symmetry with related config scripts.
    cpu_cntrl_count += 1

    cpu_ce_cntrl.responseFromDir = ruby_system.network.master
    cpu_ce_cntrl.reqToDirectory = ruby_system.network.slave

    #
    # Build GPU cluster
    #
    gpu_cluster = Cluster(intBW = 32, extBW = 32)
    gpu_cluster.disableConnectToParent()

    l2_bits = int(math.log(options.num_l2caches, 2))
    block_size_bits = int(math.log(options.cacheline_size, 2))
    # This represents the L1 to L2 interconnect latency
    # NOTE! This latency is in Ruby (cache) cycles, not SM cycles
    per_hop_interconnect_latency = 45 # ~15 GPU cycles
    num_dance_hall_hops = int(math.log(options.num_sc, 2))
    if num_dance_hall_hops == 0:
        num_dance_hall_hops = 1
    l1_to_l2_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops

    #
    # Caches for GPU cores: one L1 controller + sequencer per shader core
    #
    for i in xrange(options.num_sc):
        cache = L1Cache(size = options.sc_l1_size,
                        assoc = options.sc_l1_assoc,
                        replacement_policy = "LRU",
                        start_index_bit = block_size_bits,
                        dataArrayBanks = 4,
                        tagArrayBanks = 4,
                        dataAccessLatency = 4,
                        tagAccessLatency = 4,
                        resourceStalls = False)

        l1_cntrl = GPUL1Cache_Controller(version = i,
                                         cache = cache,
                                         l2_select_num_bits = l2_bits,
                                         num_l2 = options.num_l2caches,
                                         issue_latency = l1_to_l2_noc_latency,
                                         number_of_TBEs = options.gpu_l1_buf_depth,
                                         ruby_system = ruby_system)

        # GPU sequencer versions continue after the CPU sequencers
        gpu_seq = RubySequencer(version = options.num_cpus + i,
                                icache = cache,
                                dcache = cache,
                                max_outstanding_requests = options.gpu_l1_buf_depth,
                                ruby_system = ruby_system,
                                deadlock_threshold = 2000000,
                                connect_to_io = False)

        l1_cntrl.sequencer = gpu_seq

        exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i)

        #
        # Add controllers and sequencers to the appropriate lists
        #
        all_sequencers.append(gpu_seq)
        gpu_cluster.add(l1_cntrl)

        # Connect the controller to the network
        l1_cntrl.requestFromL1Cache = ruby_system.network.slave
        l1_cntrl.responseToL1Cache = ruby_system.network.master

    l2_index_start = block_size_bits + l2_bits
    # Use L2 cache and interconnect latencies to calculate protocol latencies
    # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles
    l2_cache_access_latency = 30 # ~10 GPU cycles
    l2_to_l1_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops
    l2_to_mem_noc_latency = 125 # ~40 GPU cycles

    l2_clusters = []
    for i in xrange(options.num_l2caches):
        #
        # Create the Ruby objects for this L2 cache slice
        #
        l2_cache = L2Cache(size = options.sc_l2_size,
                           assoc = options.sc_l2_assoc,
                           start_index_bit = l2_index_start,
                           replacement_policy = "LRU",
                           dataArrayBanks = 4,
                           tagArrayBanks = 4,
                           dataAccessLatency = 4,
                           tagAccessLatency = 4,
                           resourceStalls = options.gpu_l2_resource_stalls)

        # Region buffer attached to each L2 slice.
        # BUGFIX: the associativity was written "2^16", which is bitwise
        # XOR in Python (2^16 == 18); the intent was 2**16, i.e. an
        # effectively fully-associative buffer.
        region_buffer = regionBuffer_Obj(size = "8MB",
                                         assoc = 2 ** 16,
                                         start_index_bit = l2_index_start,
                                         replacement_policy = "LRU",
                                         dataArrayBanks = 4,
                                         tagArrayBanks = 4,
                                         dataAccessLatency = 4,
                                         tagAccessLatency = 4,
                                         resourceStalls = options.gpu_l2_resource_stalls,
                                         regionSize = options.region_size)

        l2_cntrl = GPUL2Cache_Controller(version = i,
                                         L2cache = l2_cache,
                                         regionBuffer = region_buffer,
                                         l2_response_latency = l2_cache_access_latency +
                                                               l2_to_l1_noc_latency,
                                         l2_request_latency = l2_to_mem_noc_latency,
                                         cache_response_latency = l2_cache_access_latency,
                                         ruby_system = ruby_system)

        exec("ruby_system.l2_cntrl%d = l2_cntrl" % i)
        l2_cluster = Cluster(intBW = 32, extBW = 32)
        l2_cluster.add(l2_cntrl)
        gpu_cluster.add(l2_cluster)
        l2_clusters.append(l2_cluster)

        # Connect the controller to the network
        l2_cntrl.responseToL1Cache = ruby_system.network.slave
        l2_cntrl.requestFromCache = ruby_system.network.slave
        l2_cntrl.responseFromCache = ruby_system.network.slave
        l2_cntrl.unblockFromCache = ruby_system.network.slave

        l2_cntrl.requestFromL1Cache = ruby_system.network.master
        l2_cntrl.forwardToCache = ruby_system.network.master
        l2_cntrl.responseToCache = ruby_system.network.master

    gpu_phys_mem_size = system.gpu.gpu_memory_range.size()

    # BUGFIX: dev_dir_cntrls must be defined even when no device
    # directories are created -- it is iterated unconditionally when the
    # complete cluster is assembled below.
    dev_dir_cntrls = []

    if options.num_dev_dirs > 0:
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not
                # overlap with the probe filter index
                assert(options.numa_high_bit - dir_bits > pf_bits)

            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrls)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory controller
            #

            # Device directory versions continue after the CPU directories
            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size = pf_size, assoc = 4,
                             start_index_bit = pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                            version = dir_version,
                                            size = dir_size,
                                            numa_high_bit = \
                                            options.numa_high_bit,
                                            device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = ruby_system.network.master
            dev_dir_cntrl.responseToDir = ruby_system.network.master
            dev_dir_cntrl.requestToDir = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = ruby_system.network.master

            # One memory controller per device directory, interleaved over
            # the GPU physical memory range
            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), system.gpu.gpu_memory_range,
                i, options.num_dev_dirs, int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

        system.dev_mem_ctrls = dev_mem_ctrls
    else:
        # Since there are no device directories, use CPU directories
        # Fix up the memory sizes of the CPU directories
        num_dirs = len(dir_cntrls)
        add_gpu_mem = gpu_phys_mem_size / num_dirs
        for cntrl in dir_cntrls:
            new_size = cntrl.directory.size.value + add_gpu_mem
            cntrl.directory.size.value = new_size

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size = "4096B", assoc = 2)

    gpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc + 1,
                               icache = cache,
                               dcache = cache,
                               max_outstanding_requests = 64,
                               support_inst_reqs = False,
                               ruby_system = ruby_system,
                               connect_to_io = False)

    gpu_ce_cntrl = GPUCopyDMA_Controller(version = 1,
                                         sequencer = gpu_ce_seq,
                                         number_of_TBEs = 256,
                                         ruby_system = ruby_system)

    ruby_system.l1_cntrl_ce = gpu_ce_cntrl

    all_sequencers.append(cpu_ce_seq)
    all_sequencers.append(gpu_ce_seq)

    gpu_ce_cntrl.responseFromDir = ruby_system.network.master
    gpu_ce_cntrl.reqToDirectory = ruby_system.network.slave

    # Assemble the top-level cluster from the copy engines, the CPU and
    # GPU clusters, and every directory / DMA controller
    complete_cluster = Cluster(intBW = 32, extBW = 32)
    complete_cluster.add(cpu_ce_cntrl)
    complete_cluster.add(gpu_ce_cntrl)
    complete_cluster.add(cpu_cluster)
    complete_cluster.add(gpu_cluster)

    for cntrl in dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dev_dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dma_cntrls:
        complete_cluster.add(cntrl)

    for cluster in l2_clusters:
        complete_cluster.add(cluster)

    return (all_sequencers, dir_cntrls, complete_cluster)
def create_system(options, full_system, system, dma_ports, ruby_system):
    """
    Build a Ruby system by delegating to the 'fusion' variant of the
    configured protocol, then optionally grafting on GPU device
    directories (each backed by its own memory controller) and a
    copy-engine L1 cache controller/sequencer.

    Returns (cpu_sequencers, dir_cntrl_nodes, topology) as produced by
    the delegated protocol script, extended with the controllers and
    sequencer created here.
    """

    if not buildEnv['GPGPU_SIM']:
        m5.util.panic("This script requires GPGPU-Sim integration to be built.")

    # NOTE(review): forces Ruby functional accesses to the backing store;
    # presumably required because the protocol below is faked -- confirm.
    options.access_backing_store = True

    # Run the original protocol script
    # Swap the 'split' protocol name for its 'fusion' twin, import that
    # module (the Python 2 exec statement places it in this function's
    # local namespace), then invoke its create_system() via eval on the
    # module name.
    buildEnv['PROTOCOL'] = buildEnv['PROTOCOL'].replace('split', 'fusion')
    protocol = buildEnv['PROTOCOL']
    exec "import %s" % protocol
    try:
        (cpu_sequencers, dir_cntrl_nodes, topology) = \
            eval("%s.create_system(options, full_system, system, dma_ports, ruby_system)" % protocol)
    except:
        # Bare except is intentional here: report which protocol failed,
        # then re-raise the original exception unchanged.
        print "Error: could not create system for ruby protocol inside fusion system %s" % protocol
        raise

    # Faking things to build the rest of the system
    print "Warning!"
    print "Warning: Faking split MOESI_hammer protocol; collecting checkpoints?"
    print "Warning!"

    if options.num_dev_dirs > 0:
        block_size_bits = int(math.log(options.cacheline_size, 2))
        gpu_phys_mem_size = system.gpu.gpu_memory_range.size()
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not overlap
                # with the probe filter index
                assert(options.numa_high_bit - dir_bits > pf_bits)

            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_dir_cntrls = []
        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrl_nodes)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory controller
            #

            # Device directory versions continue after the CPU directories
            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size = pf_size, assoc = 4,
                             start_index_bit = pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                            version = dir_version,
                                            size = dir_size,
                                            numa_high_bit = \
                                            options.numa_high_bit,
                                            device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = ruby_system.network.master
            dev_dir_cntrl.responseToDir = ruby_system.network.master
            dev_dir_cntrl.requestToDir = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = ruby_system.network.master

            # One memory controller per device directory, interleaved over
            # the GPU physical memory range
            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), system.gpu.gpu_memory_range,
                i, options.num_dev_dirs, int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

            topology.addController(dev_dir_cntrl)

        system.dev_mem_ctrls = dev_mem_ctrls

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    block_size_bits = int(math.log(options.cacheline_size, 2))
    l1i_cache = L1Cache(size = "2kB", assoc = 2)
    l1d_cache = L1Cache(size = "2kB", assoc = 2)
    l2_cache = L2Cache(size = "2kB",
                        assoc = 2,
                        start_index_bit = block_size_bits)

    # MOESI_hammer-style L1 controller standing in for the copy engine
    l1_cntrl = L1Cache_Controller(version = options.num_cpus + options.num_sc,
                                      L1Icache = l1i_cache,
                                      L1Dcache = l1d_cache,
                                      L2cache = l2_cache,
                                      no_mig_atomic = not \
                                          options.allow_atomic_migration,
                                      send_evictions = (
                                          options.cpu_type == "detailed"),
                                      ruby_system = ruby_system)

    gpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc,
                               icache = l1i_cache,
                               dcache = l1d_cache,
                               max_outstanding_requests = 64,
                               ruby_system = ruby_system,
                               connect_to_io = False)

    l1_cntrl.sequencer = gpu_ce_seq

    ruby_system.l1_cntrl_gpuce = l1_cntrl

    cpu_sequencers.append(gpu_ce_seq)
    topology.addController(l1_cntrl)

    # Connect the L1 controller and the network
    # Connect the buffers from the controller to network
    l1_cntrl.requestFromCache = ruby_system.network.slave
    l1_cntrl.responseFromCache = ruby_system.network.slave
    l1_cntrl.unblockFromCache = ruby_system.network.slave

    # Connect the buffers from the network to the controller
    l1_cntrl.forwardToCache = ruby_system.network.master
    l1_cntrl.responseToCache = ruby_system.network.master

    return (cpu_sequencers, dir_cntrl_nodes, topology)
Beispiel #13
0
def create_system(options, full_system, system, dma_ports, ruby_system):
    """
    MessageBuffer-based variant of the fusion fallback system builder.

    Delegates to the 'fusion' variant of the configured protocol, then
    optionally adds GPU device directories (each with its own memory
    controller) and a copy-engine L1 cache controller, wiring each
    controller to the network through explicit MessageBuffers.

    Returns (cpu_sequencers, dir_cntrl_nodes, topology).
    """

    if not buildEnv['GPGPU_SIM']:
        m5.util.panic(
            "This script requires GPGPU-Sim integration to be built.")

    # NOTE(review): forces Ruby functional accesses to the backing store;
    # presumably required because the protocol below is faked -- confirm.
    options.access_backing_store = True

    # Run the original protocol script
    # Swap the 'split' protocol name for its 'fusion' twin, import that
    # module (the Python 2 exec statement places it in this function's
    # local namespace), then invoke its create_system() via eval on the
    # module name.
    buildEnv['PROTOCOL'] = buildEnv['PROTOCOL'].replace('split', 'fusion')
    protocol = buildEnv['PROTOCOL']
    exec "import %s" % protocol
    try:
        (cpu_sequencers, dir_cntrl_nodes, topology) = \
            eval("%s.create_system(options, full_system, system, dma_ports, ruby_system)" % protocol)
    except:
        # Bare except is intentional here: report which protocol failed,
        # then re-raise the original exception unchanged.
        print "Error: could not create system for ruby protocol inside fusion system %s" % protocol
        raise

    # Faking things to build the rest of the system
    print "Warning!"
    print "Warning: Faking split MOESI_hammer protocol; collecting checkpoints?"
    print "Warning!"

    if options.num_dev_dirs > 0:
        block_size_bits = int(math.log(options.cacheline_size, 2))
        gpu_phys_mem_size = system.gpu.gpu_memory_range.size()
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not overlap
                # with the probe filter index
                assert (options.numa_high_bit - dir_bits > pf_bits)

            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_dir_cntrls = []
        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrl_nodes)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory controller
            #

            # Device directory versions continue after the CPU directories
            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size=pf_size,
                             assoc=4,
                             start_index_bit=pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                            version = dir_version,
                                            size = dir_size,
                                            numa_high_bit = \
                                            options.numa_high_bit,
                                            device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 transitions_per_cycle = options.ports,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            # (controller -> network buffers)
            dev_dir_cntrl.forwardFromDir = MessageBuffer()
            dev_dir_cntrl.forwardFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = MessageBuffer()
            dev_dir_cntrl.responseFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaResponseFromDir.master = ruby_system.network.slave

            # Internal trigger queue (not connected to the network)
            dev_dir_cntrl.triggerQueue = MessageBuffer(ordered=True)

            # (network -> controller buffers)
            dev_dir_cntrl.unblockToDir = MessageBuffer()
            dev_dir_cntrl.unblockToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseToDir = MessageBuffer()
            dev_dir_cntrl.responseToDir.slave = ruby_system.network.master
            dev_dir_cntrl.requestToDir = MessageBuffer()
            dev_dir_cntrl.requestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaRequestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseFromMemory = MessageBuffer()

            # One memory controller per device directory, interleaved over
            # the GPU physical memory range
            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                system.gpu.gpu_memory_range, i, options.num_dev_dirs,
                int(math.log(options.num_dev_dirs, 2)), options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

            topology.addController(dev_dir_cntrl)

        system.dev_mem_ctrls = dev_mem_ctrls

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    block_size_bits = int(math.log(options.cacheline_size, 2))
    l1i_cache = L1Cache(size="2kB", assoc=2)
    l1d_cache = L1Cache(size="2kB", assoc=2)
    l2_cache = L2Cache(size="2kB", assoc=2, start_index_bit=block_size_bits)

    # MOESI_hammer-style L1 controller standing in for the copy engine
    l1_cntrl = L1Cache_Controller(version = options.num_cpus + options.num_sc,
                                      L1Icache = l1i_cache,
                                      L1Dcache = l1d_cache,
                                      L2cache = l2_cache,
                                      no_mig_atomic = not \
                                          options.allow_atomic_migration,
                                      send_evictions = False,
                                      transitions_per_cycle = options.ports,
                                      ruby_system = ruby_system)

    gpu_ce_seq = RubySequencer(version=options.num_cpus + options.num_sc,
                               icache=l1i_cache,
                               dcache=l1d_cache,
                               max_outstanding_requests=64,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    l1_cntrl.sequencer = gpu_ce_seq

    ruby_system.dev_ce_cntrl = l1_cntrl

    cpu_sequencers.append(gpu_ce_seq)
    topology.addController(l1_cntrl)

    # Connect the L1 controller and the network
    # Connect the buffers from the controller to network
    l1_cntrl.requestFromCache = MessageBuffer()
    l1_cntrl.requestFromCache.master = ruby_system.network.slave
    l1_cntrl.responseFromCache = MessageBuffer()
    l1_cntrl.responseFromCache.master = ruby_system.network.slave
    l1_cntrl.unblockFromCache = MessageBuffer()
    l1_cntrl.unblockFromCache.master = ruby_system.network.slave

    # Internal trigger queue (not connected to the network)
    l1_cntrl.triggerQueue = MessageBuffer()

    # Connect the buffers from the network to the controller
    l1_cntrl.mandatoryQueue = MessageBuffer()
    l1_cntrl.forwardToCache = MessageBuffer()
    l1_cntrl.forwardToCache.slave = ruby_system.network.master
    l1_cntrl.responseToCache = MessageBuffer()
    l1_cntrl.responseToCache.slave = ruby_system.network.master

    return (cpu_sequencers, dir_cntrl_nodes, topology)