Exemple #1
0
def config_hybrid_mem(options, system):
    """
    Build the NVM and DRAM controllers of a hybrid memory and wire them up.

    The DRAM region is ``2**options.page_bits * options.ptt_length`` bytes
    and sits at the top of the ``options.mem_size`` address space; the NVM
    region covers every address below it.  The NVM controller is only
    created when the DRAM region is smaller than the total memory, and the
    DRAM controller only when the DRAM region size is positive.

    Side effects on ``system``: ``thnvm_bus`` (a new ``VirtualXBar``) and
    ``mem_ctrls`` are assigned, each controller's ``port`` is attached to
    the bus, and the bus is attached to ``system.membus``.
    """
    system.thnvm_bus = VirtualXBar()
    controllers = []

    # Interleaving granularity: 128 bytes, or the cache line size when that
    # is larger.  The 128-byte floor reflects locality observed across a
    # wide range of workloads.
    granularity = max(128, system.cache_line_size.value)

    mem_total = Addr(options.mem_size)
    dram_bytes = (2 ** options.page_bits) * options.ptt_length

    # NVM occupies [0, mem_total - dram_bytes - 1].
    if dram_bytes < mem_total.value:
        nvm_kind = MemConfig.get(options.nvm_type)
        nvm_span = AddrRange(0, mem_total - dram_bytes - 1)
        # NOTE(review): the literal 0, 1, 0 arguments presumably mean
        # "controller 0 of 1, no interleaving bits" -- confirm against
        # MemConfig.create_mem_ctrl.
        nvm = MemConfig.create_mem_ctrl(
            nvm_kind, nvm_span, 0, 1, 0, granularity)
        # An explicit rank count on the command line overrides the default,
        # but only for DRAMCtrl-derived controller classes.
        if issubclass(nvm_kind, DRAMCtrl) and options.mem_ranks:
            nvm.ranks_per_channel = options.mem_ranks

        controllers.append(nvm)

    # DRAM occupies [mem_total - dram_bytes, mem_total - 1].
    if dram_bytes > 0:
        dram_kind = MemConfig.get(options.dram_type)
        dram_span = AddrRange(mem_total - dram_bytes, mem_total - 1)
        dram = MemConfig.create_mem_ctrl(
            dram_kind, dram_span, 0, 1, 0, granularity)
        # Same rank override rule as for the NVM controller.
        if issubclass(dram_kind, DRAMCtrl) and options.mem_ranks:
            dram.ranks_per_channel = options.mem_ranks

        controllers.append(dram)

    system.mem_ctrls = controllers

    # Hang every controller off the THNVM bus ...
    for ctrl in system.mem_ctrls:
        ctrl.port = system.thnvm_bus.master

    # ... and the THNVM bus off the system memory bus.
    system.thnvm_bus.slave = system.membus.master
Exemple #2
0
def config_hybrid_mem(options, system):
    """
    Split the physical address space between an NVM and a DRAM controller.

    ``options.mem_size`` is the size of the whole address space.  Its top
    ``2**options.page_bits * options.ptt_length`` bytes are served by a
    controller of type ``options.dram_type``; the addresses below are served
    by a controller of type ``options.nvm_type``.  Either controller is
    omitted when its guard fails (DRAM size not below the total size for
    NVM, DRAM size not positive for DRAM).

    The created controllers are stored in ``system.mem_ctrls`` and connected
    to a freshly created ``system.thnvm_bus``, which is itself connected to
    ``system.membus``.
    """
    system.thnvm_bus = VirtualXBar()
    ctrl_list = []

    # Channels are interleaved at 128-byte granularity by default, or at
    # cache line granularity if the line is larger than 128 bytes.  The
    # value is based on locality seen across many workloads.
    intlv_bytes = max(128, system.cache_line_size.value)

    whole_size = Addr(options.mem_size)
    dram_region = (2 ** options.page_bits) * options.ptt_length

    if dram_region < whole_size.value:
        # Low part of the address space goes to NVM.
        nvm_class = MemConfig.get(options.nvm_type)
        low_range = AddrRange(0, whole_size - dram_region - 1)
        nvm_ctrl = MemConfig.create_mem_ctrl(
            nvm_class, low_range, 0, 1, 0, intlv_bytes)
        # Honour --mem-ranks only if it was given and the class is a
        # DRAMCtrl subclass.
        if issubclass(nvm_class, DRAMCtrl) and options.mem_ranks:
            nvm_ctrl.ranks_per_channel = options.mem_ranks

        ctrl_list.append(nvm_ctrl)

    if dram_region > 0:
        # High part of the address space goes to DRAM.
        dram_class = MemConfig.get(options.dram_type)
        high_range = AddrRange(whole_size - dram_region, whole_size - 1)
        dram_ctrl = MemConfig.create_mem_ctrl(
            dram_class, high_range, 0, 1, 0, intlv_bytes)
        # Honour --mem-ranks only if it was given and the class is a
        # DRAMCtrl subclass.
        if issubclass(dram_class, DRAMCtrl) and options.mem_ranks:
            dram_ctrl.ranks_per_channel = options.mem_ranks

        ctrl_list.append(dram_ctrl)

    system.mem_ctrls = ctrl_list

    # Connect the controllers to the THNVM bus, then the bus to the membus.
    for attached in system.mem_ctrls:
        attached.port = system.thnvm_bus.master

    system.thnvm_bus.slave = system.membus.master
Exemple #3
0
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """
    Create the memory controllers behind each Ruby directory controller.

    Sets ``ruby.block_size_bytes`` and ``ruby.memory_size_bits``, assigns
    the NUMA high bit of every directory, and creates one memory controller
    per (directory controller, ``system.mem_ranges`` entry) pair.

    On non-ARM targets a directory with more than one address range gets an
    ``IOXBar`` between itself and its controllers; with a single range the
    controller is connected to ``dir_cntrl.memory`` directly.  On ARM
    targets both the directory and the controllers are attached to
    ``system.iobus`` instead.

    All controllers end up in ``system.mem_ctrls``; any crossbars created
    are stored in ``ruby.crossbars``.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48
    block_size_bits = int(math.log(options.cacheline_size, 2))

    numa_bit = options.numa_high_bit
    if not numa_bit:
        # No explicit NUMA bit: the directory bits are the lowest bits
        # above the block offset, and the NUMA bit is the highest of them.
        dir_bits = int(math.log(options.num_dirs, 2))
        numa_bit = block_size_bits + dir_bits - 1

    mem_ctrls = []
    crossbars = []

    # A separate controller is created for every address range because the
    # abstract memory can only handle one contiguous range at present.
    for index, dir_cntrl in enumerate(dir_cntrls):
        dir_cntrl.directory.numa_high_bit = numa_bit

        xbar = None
        if buildEnv['TARGET_ISA'] == "arm":
            # ARM: the directory talks to memory through the I/O bus.
            dir_cntrl.memory = system.iobus.slave
        elif len(system.mem_ranges) > 1:
            xbar = IOXBar()
            crossbars.append(xbar)
            dir_cntrl.memory = xbar.slave

        for addr_range in system.mem_ranges:
            ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                addr_range,
                index,
                options.num_dirs,
                int(math.log(options.num_dirs, 2)),
                options.cacheline_size)

            mem_ctrls.append(ctrl)

            if buildEnv['TARGET_ISA'] == "arm":
                ctrl.port = system.iobus.master
            elif xbar is None:
                ctrl.port = dir_cntrl.memory
            else:
                ctrl.port = xbar.master

    system.mem_ctrls = mem_ctrls

    if buildEnv['TARGET_ISA'] != "arm" and crossbars:
        ruby.crossbars = crossbars
Exemple #4
0
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """
    Attach memory controllers to every Ruby directory controller.

    Sets ``ruby.block_size_bytes`` and ``ruby.memory_size_bits``, assigns
    the NUMA high bit of every directory, and creates one memory controller
    per (directory controller, ``system.mem_ranges`` entry) pair.  When
    there is more than one address range, a ``NoncoherentXBar`` is placed
    between the directory and its controllers; otherwise the single
    controller is connected to ``dir_cntrl.memory`` directly.

    All controllers end up in ``system.mem_ctrls``; any crossbars created
    are stored in ``ruby.crossbars``.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48
    block_size_bits = int(math.log(options.cacheline_size, 2))

    numa_bit = options.numa_high_bit
    if not numa_bit:
        # Default: directory-select bits sit directly above the block
        # offset bits and the NUMA bit is the topmost of those.
        dir_bits = int(math.log(options.num_dirs, 2))
        numa_bit = block_size_bits + dir_bits - 1

    mem_ctrls = []
    crossbars = []

    # The abstract memory handles only one contiguous address range, hence
    # one controller per range for each directory.
    for index, dir_cntrl in enumerate(dir_cntrls):
        dir_cntrl.directory.numa_high_bit = numa_bit

        xbar = None
        if len(system.mem_ranges) > 1:
            xbar = NoncoherentXBar()
            crossbars.append(xbar)
            dir_cntrl.memory = xbar.slave

        for addr_range in system.mem_ranges:
            ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                addr_range,
                index,
                options.num_dirs,
                int(math.log(options.num_dirs, 2)),
                options.cacheline_size)

            mem_ctrls.append(ctrl)

            # Through the crossbar if one exists, else straight to the
            # directory's memory port.
            if xbar is None:
                ctrl.port = dir_cntrl.memory
            else:
                ctrl.port = xbar.master

    system.mem_ctrls = mem_ctrls

    if crossbars:
        ruby.crossbars = crossbars
Exemple #5
0
def setup_memory_controllers(system, ruby, dir_cntrls, options):
    """
    Attach memory controllers (and optional DRAM caches) to the directories.

    Sets ``ruby.block_size_bytes`` and ``ruby.memory_size_bits`` and assigns
    the NUMA high bit of every directory.  For each directory controller:

    * if ``options.dramcache`` is set, one DRAM cache controller is created
      over ``system.mem_ranges[0]`` and connected to ``dir_cntrl.memory``;
    * one memory controller is created per ``system.mem_ranges`` entry,
      interleaved at ``options.dramcache_block_size`` when a DRAM cache is
      present and at ``options.cacheline_size`` otherwise, and connected
      behind the DRAM cache (or directly to the directory).

    More than one address range is reported through ``fatal``.

    All controllers, DRAM caches included, end up in ``system.mem_ctrls``;
    any crossbars created are stored in ``ruby.crossbars``.
    """
    ruby.block_size_bytes = options.cacheline_size
    ruby.memory_size_bits = 48
    block_size_bits = int(math.log(options.cacheline_size, 2))

    numa_bit = options.numa_high_bit
    if not numa_bit:
        # Default: directory-select bits sit directly above the block
        # offset bits and the NUMA bit is the topmost of those.
        dir_bits = int(math.log(options.num_dirs, 2))
        numa_bit = block_size_bits + dir_bits - 1

    mem_ctrls = []
    crossbars = []

    # The abstract memory handles only one contiguous address range, hence
    # one controller per range for each directory.
    for index, dir_cntrl in enumerate(dir_cntrls):
        # One DRAMCache instance per directory controller.
        if options.dramcache:
            dramcache_ctrl = MemConfig.create_dramcache_ctrl(
                MemConfig.get_cache(options.dramcache_type),
                system.mem_ranges[0],
                index,
                options.num_dirs,
                options.dramcache_size,
                options.dramcache_assoc,
                options.dramcache_block_size,
                options.num_cpus,
                options.dramcache_timing)

            mem_ctrls.append(dramcache_ctrl)
            dir_cntrl.memory = dramcache_ctrl.port

        dir_cntrl.directory.numa_high_bit = numa_bit

        xbar = None
        if len(system.mem_ranges) > 1:
            # Multiple memory ranges are not supported by this setup.
            fatal("system mem_ranges greater than 1")
            # NOTE(review): if fatal() terminates the run, the crossbar
            # wiring below is never reached -- confirm and consider removing.
            xbar = IOXBar()
            crossbars.append(xbar)
            if options.dramcache:
                dramcache_ctrl.dramcache_masterport = xbar.slave
            else:
                dir_cntrl.memory = xbar.slave

        for addr_range in system.mem_ranges:
            # With a DRAM cache in front, interleave at the DRAM cache
            # block size instead of the cache line size.
            ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                addr_range,
                index,
                options.num_dirs,
                int(math.log(options.num_dirs, 2)),
                (options.dramcache_block_size if options.dramcache
                 else options.cacheline_size))

            mem_ctrls.append(ctrl)

            if xbar is not None:
                ctrl.port = xbar.master
            elif options.dramcache:
                ctrl.port = dramcache_ctrl.dramcache_masterport
            else:
                ctrl.port = dir_cntrl.memory

    system.mem_ctrls = mem_ctrls

    if crossbars:
        ruby.crossbars = crossbars
Exemple #6
0
def create_system(options, full_system, system, dma_devices, ruby_system):
    """
    Build the fused CPU + GPU Ruby cache hierarchy.

    The CPU side (CPU cluster, directories, DMA controllers) is created by
    ``VI_hammer.create_system``.  This function then adds:

    * a copy-engine controller/sequencer pair for the CPU side and one for
      the GPU side,
    * one GPU L1 controller and sequencer per shader core
      (``options.num_sc``),
    * one GPU L2 controller per ``options.num_l2caches``, each in its own
      cluster,
    * either ``options.num_dev_dirs`` device directories with their own
      memory controllers, or -- when that count is zero -- extra capacity
      on the existing CPU directories to cover the GPU memory range.

    Returns:
        A tuple ``(all_sequencers, dir_cntrls, complete_cluster)`` where
        ``all_sequencers`` holds the CPU, GPU and copy-engine sequencers,
        ``dir_cntrls`` are the CPU directory controllers and
        ``complete_cluster`` is the top-level cluster containing every
        controller.

    Calls ``m5.util.panic`` when ``buildEnv['GPGPU_SIM']`` is false.
    """
    if not buildEnv['GPGPU_SIM']:
        m5.util.panic(
            "This script requires GPGPU-Sim integration to be built.")

    # Run the protocol script to setup CPU cluster, directory and DMA
    (all_sequencers, dir_cntrls, dma_cntrls, cpu_cluster) = \
        VI_hammer.create_system(options,
                                full_system,
                                system,
                                dma_devices,
                                ruby_system)

    # If we're going to split the directories/memory controllers
    # NOTE(review): cpu_cntrl_count is computed and incremented but never
    # read again in this function.
    if options.num_dev_dirs > 0:
        cpu_cntrl_count = len(cpu_cluster)
    else:
        cpu_cntrl_count = len(cpu_cluster) + len(dir_cntrls)

    #
    # Create controller for the copy engine to connect to in CPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size="4096B", assoc=2)

    cpu_ce_seq = RubySequencer(version=options.num_cpus + options.num_sc,
                               icache=cache,
                               dcache=cache,
                               max_outstanding_requests=64,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    cpu_ce_cntrl = GPUCopyDMA_Controller(version=0,
                                         sequencer=cpu_ce_seq,
                                         number_of_TBEs=256,
                                         transitions_per_cycle=options.ports,
                                         ruby_system=ruby_system)

    cpu_ce_cntrl.responseFromDir = MessageBuffer(ordered=True)
    cpu_ce_cntrl.responseFromDir.slave = ruby_system.network.master
    cpu_ce_cntrl.reqToDirectory = MessageBuffer(ordered=True)
    cpu_ce_cntrl.reqToDirectory.master = ruby_system.network.slave

    cpu_ce_cntrl.mandatoryQueue = MessageBuffer()

    ruby_system.ce_cntrl = cpu_ce_cntrl

    cpu_cntrl_count += 1

    #
    # Build GPU cluster
    #
    gpu_cluster = Cluster(intBW=32, extBW=32)
    gpu_cluster.disableConnectToParent()

    l2_bits = int(math.log(options.num_l2caches, 2))
    block_size_bits = int(math.log(options.cacheline_size, 2))
    # This represents the L1 to L2 interconnect latency
    # NOTE! This latency is in Ruby (cache) cycles, not SM cycles
    per_hop_interconnect_latency = 45  # ~15 GPU cycles
    # Always account for at least one hop, even with a single shader core.
    num_dance_hall_hops = int(math.log(options.num_sc, 2))
    if num_dance_hall_hops == 0:
        num_dance_hall_hops = 1
    l1_to_l2_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops

    #
    # Caches for GPU cores
    #
    for i in xrange(options.num_sc):
        #
        # First create the Ruby objects associated with the GPU cores
        #
        cache = L1Cache(size=options.sc_l1_size,
                        assoc=options.sc_l1_assoc,
                        replacement_policy=LRUReplacementPolicy(),
                        start_index_bit=block_size_bits,
                        dataArrayBanks=4,
                        tagArrayBanks=4,
                        dataAccessLatency=4,
                        tagAccessLatency=4,
                        resourceStalls=False)

        l1_cntrl = GPUL1Cache_Controller(
            version=i,
            cache=cache,
            l2_select_num_bits=l2_bits,
            num_l2=options.num_l2caches,
            transitions_per_cycle=options.ports,
            issue_latency=l1_to_l2_noc_latency,
            number_of_TBEs=options.gpu_l1_buf_depth,
            ruby_system=ruby_system)

        gpu_seq = RubySequencer(
            version=options.num_cpus + i,
            icache=cache,
            dcache=cache,
            max_outstanding_requests=options.gpu_l1_buf_depth,
            ruby_system=ruby_system,
            deadlock_threshold=2000000,
            connect_to_io=False)

        l1_cntrl.sequencer = gpu_seq

        # Register the controller under a per-core attribute name.
        setattr(ruby_system, "l1_cntrl_sp%02d" % i, l1_cntrl)

        #
        # Add controllers and sequencers to the appropriate lists
        #
        all_sequencers.append(gpu_seq)
        gpu_cluster.add(l1_cntrl)

        # Connect the controller to the network
        l1_cntrl.requestFromL1Cache = MessageBuffer(ordered=True)
        l1_cntrl.requestFromL1Cache.master = ruby_system.network.slave
        l1_cntrl.responseToL1Cache = MessageBuffer(ordered=True)
        l1_cntrl.responseToL1Cache.slave = ruby_system.network.master

        l1_cntrl.mandatoryQueue = MessageBuffer()

    l2_index_start = block_size_bits + l2_bits
    # Use L2 cache and interconnect latencies to calculate protocol latencies
    # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles
    l2_cache_access_latency = 30  # ~10 GPU cycles
    l2_to_l1_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops
    l2_to_mem_noc_latency = 125  # ~40 GPU cycles

    l2_clusters = []
    for i in xrange(options.num_l2caches):
        #
        # First create the Ruby objects associated with this L2 bank
        #
        l2_cache = L2Cache(size=options.sc_l2_size,
                           assoc=options.sc_l2_assoc,
                           start_index_bit=l2_index_start,
                           replacement_policy=LRUReplacementPolicy(),
                           dataArrayBanks=4,
                           tagArrayBanks=4,
                           dataAccessLatency=4,
                           tagAccessLatency=4,
                           resourceStalls=options.gpu_l2_resource_stalls)

        l2_cntrl = GPUL2Cache_Controller(
            version=i,
            L2cache=l2_cache,
            transitions_per_cycle=options.ports,
            l2_response_latency=l2_cache_access_latency + l2_to_l1_noc_latency,
            l2_request_latency=l2_to_mem_noc_latency,
            cache_response_latency=l2_cache_access_latency,
            ruby_system=ruby_system)

        # Register the controller under a per-bank attribute name.
        setattr(ruby_system, "l2_cntrl%d" % i, l2_cntrl)
        l2_cluster = Cluster(intBW=32, extBW=32)
        l2_cluster.add(l2_cntrl)
        gpu_cluster.add(l2_cluster)
        l2_clusters.append(l2_cluster)

        # Connect the controller to the network
        l2_cntrl.responseToL1Cache = MessageBuffer(ordered=True)
        l2_cntrl.responseToL1Cache.master = ruby_system.network.slave
        l2_cntrl.requestFromCache = MessageBuffer()
        l2_cntrl.requestFromCache.master = ruby_system.network.slave
        l2_cntrl.responseFromCache = MessageBuffer()
        l2_cntrl.responseFromCache.master = ruby_system.network.slave
        l2_cntrl.unblockFromCache = MessageBuffer()
        l2_cntrl.unblockFromCache.master = ruby_system.network.slave

        l2_cntrl.requestFromL1Cache = MessageBuffer(ordered=True)
        l2_cntrl.requestFromL1Cache.slave = ruby_system.network.master
        l2_cntrl.forwardToCache = MessageBuffer()
        l2_cntrl.forwardToCache.slave = ruby_system.network.master
        l2_cntrl.responseToCache = MessageBuffer()
        l2_cntrl.responseToCache.slave = ruby_system.network.master

        l2_cntrl.triggerQueue = MessageBuffer()

    gpu_phys_mem_size = system.gpu.gpu_memory_range.size()

    # Initialised before the branch: the list is iterated unconditionally
    # when the top-level cluster is assembled below, so it must exist even
    # when no device directories are requested.
    dev_dir_cntrls = []

    if options.num_dev_dirs > 0:
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not overlap
                # with the probe filter index
                assert (options.numa_high_bit - dir_bits > pf_bits)

            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrls)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory controller
            #

            # Device directory versions continue after the CPU directories.
            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size=pf_size,
                             assoc=4,
                             start_index_bit=pf_start_bit)

            dev_dir_cntrl = Directory_Controller(
                version=dir_version,
                directory=RubyDirectoryMemory(
                    version=dir_version,
                    size=dir_size,
                    numa_high_bit=options.numa_high_bit,
                    device_directory=True),
                probeFilter=pf,
                probe_filter_enabled=options.pf_on,
                full_bit_dir_enabled=options.dir_on,
                transitions_per_cycle=options.ports,
                ruby_system=ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            # Register the controller under a per-directory attribute name.
            setattr(ruby_system, "dev_dir_cntrl%d" % i, dev_dir_cntrl)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = MessageBuffer()
            dev_dir_cntrl.forwardFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = MessageBuffer()
            dev_dir_cntrl.responseFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaResponseFromDir.master = ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = MessageBuffer()
            dev_dir_cntrl.unblockToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseToDir = MessageBuffer()
            dev_dir_cntrl.responseToDir.slave = ruby_system.network.master
            dev_dir_cntrl.requestToDir = MessageBuffer()
            dev_dir_cntrl.requestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaRequestToDir.slave = ruby_system.network.master

            dev_dir_cntrl.triggerQueue = MessageBuffer(ordered=True)
            dev_dir_cntrl.responseFromMemory = MessageBuffer()

            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                system.gpu.gpu_memory_range, i, options.num_dev_dirs,
                int(math.log(options.num_dev_dirs, 2)), options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

        system.dev_mem_ctrls = dev_mem_ctrls
    else:
        # Since there are no device directories, use CPU directories
        # Fix up the memory sizes of the CPU directories
        num_dirs = len(dir_cntrls)
        add_gpu_mem = gpu_phys_mem_size / num_dirs
        for cntrl in dir_cntrls:
            new_size = cntrl.directory.size.value + add_gpu_mem
            cntrl.directory.size.value = new_size

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size="4096B", assoc=2)

    gpu_ce_seq = RubySequencer(version=options.num_cpus + options.num_sc + 1,
                               icache=cache,
                               dcache=cache,
                               max_outstanding_requests=64,
                               support_inst_reqs=False,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    gpu_ce_cntrl = GPUCopyDMA_Controller(version=1,
                                         sequencer=gpu_ce_seq,
                                         number_of_TBEs=256,
                                         transitions_per_cycle=options.ports,
                                         ruby_system=ruby_system)

    ruby_system.dev_ce_cntrl = gpu_ce_cntrl

    # The copy-engine sequencers go last, after all CPU and GPU sequencers.
    all_sequencers.append(cpu_ce_seq)
    all_sequencers.append(gpu_ce_seq)

    gpu_ce_cntrl.responseFromDir = MessageBuffer(ordered=True)
    gpu_ce_cntrl.responseFromDir.slave = ruby_system.network.master
    gpu_ce_cntrl.reqToDirectory = MessageBuffer(ordered=True)
    gpu_ce_cntrl.reqToDirectory.master = ruby_system.network.slave

    gpu_ce_cntrl.mandatoryQueue = MessageBuffer()

    # Assemble the top-level cluster holding every controller.
    complete_cluster = Cluster(intBW=32, extBW=32)
    complete_cluster.add(cpu_ce_cntrl)
    complete_cluster.add(gpu_ce_cntrl)
    complete_cluster.add(cpu_cluster)
    complete_cluster.add(gpu_cluster)

    for cntrl in dir_cntrls:
        complete_cluster.add(cntrl)

    # Empty when no device directories were requested.
    for cntrl in dev_dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dma_cntrls:
        complete_cluster.add(cntrl)

    for cluster in l2_clusters:
        complete_cluster.add(cluster)

    return (all_sequencers, dir_cntrls, complete_cluster)
def create_system(options, full_system, system, dma_devices, ruby_system):

    if not buildEnv['GPGPU_SIM']:
        m5.util.panic("This script requires GPGPU-Sim integration to be built.")

    # Run the protocol script to setup CPU cluster, directory and DMA
    (all_sequencers, dir_cntrls, dma_cntrls, cpu_cluster) = \
                                        VI_hammer.create_system(options,
                                                                full_system,
                                                                system,
                                                                dma_devices,
                                                                ruby_system)

    # If we're going to split the directories/memory controllers
    if options.num_dev_dirs > 0:
        cpu_cntrl_count = len(cpu_cluster)
    else:
        cpu_cntrl_count = len(cpu_cluster) + len(dir_cntrls)

    #
    # Create controller for the copy engine to connect to in CPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size = "4096B", assoc = 2)

    cpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc,
                               icache = cache,
                               dcache = cache,
                               max_outstanding_requests = 64,
                               ruby_system = ruby_system,
                               connect_to_io = False)

    cpu_ce_cntrl = GPUCopyDMA_Controller(version = 0,
                                         sequencer = cpu_ce_seq,
                                         number_of_TBEs = 256,
                                         ruby_system = ruby_system)

    cpu_cntrl_count += 1

    cpu_ce_cntrl.responseFromDir = ruby_system.network.master
    cpu_ce_cntrl.reqToDirectory = ruby_system.network.slave

    #
    # Build GPU cluster
    #
    gpu_cluster = Cluster(intBW = 32, extBW = 32)
    gpu_cluster.disableConnectToParent()

    l2_bits = int(math.log(options.num_l2caches, 2))
    block_size_bits = int(math.log(options.cacheline_size, 2))
    # This represents the L1 to L2 interconnect latency
    # NOTE! This latency is in Ruby (cache) cycles, not SM cycles
    per_hop_interconnect_latency = 45 # ~15 GPU cycles
    num_dance_hall_hops = int(math.log(options.num_sc, 2))
    if num_dance_hall_hops == 0:
        num_dance_hall_hops = 1
    l1_to_l2_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops

    #
    # Caches for GPU cores
    #
    for i in xrange(options.num_sc):
        #
        # First create the Ruby objects associated with the GPU cores
        #
        cache = L1Cache(size = options.sc_l1_size,
                            assoc = options.sc_l1_assoc,
                            replacement_policy = "LRU",
                            start_index_bit = block_size_bits,
                            dataArrayBanks = 4,
                            tagArrayBanks = 4,
                            dataAccessLatency = 4,
                            tagAccessLatency = 4,
                            resourceStalls = False)

        l1_cntrl = GPUL1Cache_Controller(version = i,
                                  cache = cache,
                                  l2_select_num_bits = l2_bits,
                                  num_l2 = options.num_l2caches,
                                  issue_latency = l1_to_l2_noc_latency,
                                  number_of_TBEs = options.gpu_l1_buf_depth,
                                  ruby_system = ruby_system)

        gpu_seq = RubySequencer(version = options.num_cpus + i,
                            icache = cache,
                            dcache = cache,
                            max_outstanding_requests = options.gpu_l1_buf_depth,
                            ruby_system = ruby_system,
                            deadlock_threshold = 2000000,
                            connect_to_io = False)

        l1_cntrl.sequencer = gpu_seq

        exec("ruby_system.l1_cntrl_sp%02d = l1_cntrl" % i)

        #
        # Add controllers and sequencers to the appropriate lists
        #
        all_sequencers.append(gpu_seq)
        gpu_cluster.add(l1_cntrl)

        # Connect the controller to the network
        l1_cntrl.requestFromL1Cache = ruby_system.network.slave
        l1_cntrl.responseToL1Cache = ruby_system.network.master

    l2_index_start = block_size_bits + l2_bits
    # Use L2 cache and interconnect latencies to calculate protocol latencies
    # NOTE! These latencies are in Ruby (cache) cycles, not SM cycles
    l2_cache_access_latency = 30 # ~10 GPU cycles
    l2_to_l1_noc_latency = per_hop_interconnect_latency * num_dance_hall_hops
    l2_to_mem_noc_latency = 125 # ~40 GPU cycles

    l2_clusters = []
    for i in xrange(options.num_l2caches):
        #
        # First create the Ruby objects associated with this cpu
        #
        l2_cache = L2Cache(size = options.sc_l2_size,
                           assoc = options.sc_l2_assoc,
                           start_index_bit = l2_index_start,
                           replacement_policy = "LRU",
                           dataArrayBanks = 4,
                           tagArrayBanks = 4,
                           dataAccessLatency = 4,
                           tagAccessLatency = 4,
                           resourceStalls = options.gpu_l2_resource_stalls)

	region_buffer = regionBuffer_Obj(size = "8MB",
                           assoc = 2^16,
                           start_index_bit = l2_index_start,
                           replacement_policy = "LRU",
                           dataArrayBanks = 4,
                           tagArrayBanks = 4,
                           dataAccessLatency = 4,
                           tagAccessLatency = 4,
                           resourceStalls = options.gpu_l2_resource_stalls,
 			   regionSize = options.region_size)



        l2_cntrl = GPUL2Cache_Controller(version = i,
                                L2cache = l2_cache,
				regionBuffer = region_buffer,
                                l2_response_latency = l2_cache_access_latency +
                                                      l2_to_l1_noc_latency,
                                l2_request_latency = l2_to_mem_noc_latency,
                                cache_response_latency = l2_cache_access_latency,
                                ruby_system = ruby_system)

        exec("ruby_system.l2_cntrl%d = l2_cntrl" % i)
        l2_cluster = Cluster(intBW = 32, extBW = 32)
        l2_cluster.add(l2_cntrl)
        gpu_cluster.add(l2_cluster)
        l2_clusters.append(l2_cluster)

        # Connect the controller to the network
        l2_cntrl.responseToL1Cache = ruby_system.network.slave
        l2_cntrl.requestFromCache = ruby_system.network.slave
        l2_cntrl.responseFromCache = ruby_system.network.slave
        l2_cntrl.unblockFromCache = ruby_system.network.slave

        l2_cntrl.requestFromL1Cache = ruby_system.network.master
        l2_cntrl.forwardToCache = ruby_system.network.master
        l2_cntrl.responseToCache = ruby_system.network.master

    gpu_phys_mem_size = system.gpu.gpu_memory_range.size()

    if options.num_dev_dirs > 0:
        mem_module_size = gpu_phys_mem_size / options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not overlap
                # with the probe filter index
                assert(options.numa_high_bit - dir_bits > pf_bits)

            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        else:
            if dir_bits > 0:
                pf_start_bit = dir_bits + block_size_bits - 1
            else:
                pf_start_bit = block_size_bits

        dev_dir_cntrls = []
        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrls)
        for i in xrange(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory controller
            #

            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size = pf_size, assoc = 4,
                             start_index_bit = pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                            version = dir_version,
                                            size = dir_size,
                                            numa_high_bit = \
                                            options.numa_high_bit,
                                            device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            exec("ruby_system.dev_dir_cntrl%d = dev_dir_cntrl" % i)
            dev_dir_cntrls.append(dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = ruby_system.network.master
            dev_dir_cntrl.responseToDir = ruby_system.network.master
            dev_dir_cntrl.requestToDir = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = ruby_system.network.master

            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), system.gpu.gpu_memory_range,
                i, options.num_dev_dirs, int(math.log(options.num_dev_dirs, 2)),
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

        system.dev_mem_ctrls = dev_mem_ctrls
    else:
        # Since there are no device directories, use CPU directories
        # Fix up the memory sizes of the CPU directories
        num_dirs = len(dir_cntrls)
        add_gpu_mem = gpu_phys_mem_size / num_dirs
        for cntrl in dir_cntrls:
            new_size = cntrl.directory.size.value + add_gpu_mem
            cntrl.directory.size.value = new_size

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    cache = L1Cache(size = "4096B", assoc = 2)

    gpu_ce_seq = RubySequencer(version = options.num_cpus + options.num_sc + 1,
                               icache = cache,
                               dcache = cache,
                               max_outstanding_requests = 64,
                               support_inst_reqs = False,
                               ruby_system = ruby_system,
                               connect_to_io = False)

    gpu_ce_cntrl = GPUCopyDMA_Controller(version = 1,
                                  sequencer = gpu_ce_seq,
                                  number_of_TBEs = 256,
                                  ruby_system = ruby_system)

    ruby_system.l1_cntrl_ce = gpu_ce_cntrl

    all_sequencers.append(cpu_ce_seq)
    all_sequencers.append(gpu_ce_seq)

    gpu_ce_cntrl.responseFromDir = ruby_system.network.master
    gpu_ce_cntrl.reqToDirectory = ruby_system.network.slave

    complete_cluster = Cluster(intBW = 32, extBW = 32)
    complete_cluster.add(cpu_ce_cntrl)
    complete_cluster.add(gpu_ce_cntrl)
    complete_cluster.add(cpu_cluster)
    complete_cluster.add(gpu_cluster)

    for cntrl in dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dev_dir_cntrls:
        complete_cluster.add(cntrl)

    for cntrl in dma_cntrls:
        complete_cluster.add(cntrl)

    for cluster in l2_clusters:
        complete_cluster.add(cluster)

    return (all_sequencers, dir_cntrls, complete_cluster)
def create_system(options, full_system, system, dma_ports, ruby_system):
    """Build the 'fusion' Ruby protocol system and add the GPU-side pieces.

    The function rewrites buildEnv['PROTOCOL'] by replacing 'split' with
    'fusion', imports the module of that name and delegates to its
    create_system().  On top of the result it then:

      * when options.num_dev_dirs > 0, creates that many device directory
        controllers (each with a probe filter and its own memory controller
        covering system.gpu.gpu_memory_range), attaches them to ruby_system
        as dev_dir_cntrl<N>, wires them to ruby_system.network and adds them
        to the topology; the memory controllers are stored on
        system.dev_mem_ctrls;
      * creates an L1 cache controller plus sequencer for the GPU copy
        engine, attaches it as ruby_system.l1_cntrl_gpuce and wires it to
        the network.

    Arguments:
        options: parsed command-line options; options.access_backing_store
            is forced to True here.
        full_system, dma_ports: only forwarded to the delegated protocol's
            create_system().
        system: system object providing gpu.gpu_memory_range; receives
            dev_mem_ctrls when device directories are created.
        ruby_system: Ruby system that owns the new controllers and whose
            network they are connected to.

    Returns:
        (cpu_sequencers, dir_cntrl_nodes, topology) as produced by the
        delegated create_system(), with the copy-engine sequencer appended
        to cpu_sequencers and the new controllers added to topology.
        dir_cntrl_nodes is returned unchanged.
    """
    # Imported locally so this block does not rely on a file-level import.
    import importlib

    if not buildEnv['GPGPU_SIM']:
        m5.util.panic("This script requires GPGPU-Sim integration to be built.")

    options.access_backing_store = True

    # Run the original protocol script.  The module is looked up by name
    # with importlib instead of exec/eval on a formatted source string.
    buildEnv['PROTOCOL'] = buildEnv['PROTOCOL'].replace('split', 'fusion')
    protocol = buildEnv['PROTOCOL']
    protocol_module = importlib.import_module(protocol)
    try:
        (cpu_sequencers, dir_cntrl_nodes, topology) = \
            protocol_module.create_system(options, full_system, system,
                                          dma_ports, ruby_system)
    except:
        # Deliberately broad: the failure is only annotated with the
        # protocol name and is always re-raised unchanged.
        print("Error: could not create system for ruby protocol inside fusion system %s" % protocol)
        raise

    # Faking things to build the rest of the system
    print("Warning!")
    print("Warning: Faking split MOESI_hammer protocol; collecting checkpoints?")
    print("Warning!")

    # Number of block-offset bits; used for the probe filter index below and
    # for the copy-engine L2 cache index further down.
    block_size_bits = int(math.log(options.cacheline_size, 2))

    if options.num_dev_dirs > 0:
        gpu_phys_mem_size = system.gpu.gpu_memory_range.size()
        # Floor division keeps the per-directory size an integer.
        mem_module_size = gpu_phys_mem_size // options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not
                # overlap with the probe filter index
                assert(options.numa_high_bit - dir_bits > pf_bits)

            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        elif dir_bits > 0:
            pf_start_bit = dir_bits + block_size_bits - 1
        else:
            pf_start_bit = block_size_bits

        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrl_nodes)
        for i in range(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory controller
            #

            # Device directories are numbered after the CPU directories.
            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size = pf_size, assoc = 4,
                             start_index_bit = pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                            version = dir_version,
                                            size = dir_size,
                                            numa_high_bit = \
                                            options.numa_high_bit,
                                            device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            # Attach as ruby_system.dev_dir_cntrl<i>.
            setattr(ruby_system, "dev_dir_cntrl%d" % i, dev_dir_cntrl)

            # Connect the directory controller to the network
            dev_dir_cntrl.forwardFromDir = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = ruby_system.network.slave

            dev_dir_cntrl.unblockToDir = ruby_system.network.master
            dev_dir_cntrl.responseToDir = ruby_system.network.master
            dev_dir_cntrl.requestToDir = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = ruby_system.network.master

            # One memory controller per device directory, all over the same
            # GPU memory range; dir_bits is log2(num_dev_dirs).
            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type), system.gpu.gpu_memory_range,
                i, options.num_dev_dirs, dir_bits,
                options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

            topology.addController(dev_dir_cntrl)

        system.dev_mem_ctrls = dev_mem_ctrls

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    l1i_cache = L1Cache(size = "2kB", assoc = 2)
    l1d_cache = L1Cache(size = "2kB", assoc = 2)
    l2_cache = L2Cache(size = "2kB",
                        assoc = 2,
                        start_index_bit = block_size_bits)

    # The controller and its sequencer share the same version number.
    ce_version = options.num_cpus + options.num_sc

    l1_cntrl = L1Cache_Controller(version = ce_version,
                                      L1Icache = l1i_cache,
                                      L1Dcache = l1d_cache,
                                      L2cache = l2_cache,
                                      no_mig_atomic = not \
                                          options.allow_atomic_migration,
                                      send_evictions = (
                                          options.cpu_type == "detailed"),
                                      ruby_system = ruby_system)

    gpu_ce_seq = RubySequencer(version = ce_version,
                               icache = l1i_cache,
                               dcache = l1d_cache,
                               max_outstanding_requests = 64,
                               ruby_system = ruby_system,
                               connect_to_io = False)

    l1_cntrl.sequencer = gpu_ce_seq

    ruby_system.l1_cntrl_gpuce = l1_cntrl

    cpu_sequencers.append(gpu_ce_seq)
    topology.addController(l1_cntrl)

    # Connect the L1 controller and the network
    # Connect the buffers from the controller to network
    l1_cntrl.requestFromCache = ruby_system.network.slave
    l1_cntrl.responseFromCache = ruby_system.network.slave
    l1_cntrl.unblockFromCache = ruby_system.network.slave

    # Connect the buffers from the network to the controller
    l1_cntrl.forwardToCache = ruby_system.network.master
    l1_cntrl.responseToCache = ruby_system.network.master

    return (cpu_sequencers, dir_cntrl_nodes, topology)
Exemple #9
0
def create_system(options, full_system, system, dma_ports, ruby_system):
    """Build the 'fusion' Ruby protocol system and add the GPU-side pieces.

    The function rewrites buildEnv['PROTOCOL'] by replacing 'split' with
    'fusion', imports the module of that name and delegates to its
    create_system().  On top of the result it then:

      * when options.num_dev_dirs > 0, creates that many device directory
        controllers (each with a probe filter, its MessageBuffers and its
        own memory controller covering system.gpu.gpu_memory_range),
        attaches them to ruby_system as dev_dir_cntrl<N>, connects their
        buffers to ruby_system.network and adds them to the topology; the
        memory controllers are stored on system.dev_mem_ctrls;
      * creates an L1 cache controller plus sequencer for the GPU copy
        engine, attaches it as ruby_system.dev_ce_cntrl and connects its
        MessageBuffers to the network.

    Arguments:
        options: parsed command-line options; options.access_backing_store
            is forced to True here.
        full_system, dma_ports: only forwarded to the delegated protocol's
            create_system().
        system: system object providing gpu.gpu_memory_range; receives
            dev_mem_ctrls when device directories are created.
        ruby_system: Ruby system that owns the new controllers and whose
            network they are connected to.

    Returns:
        (cpu_sequencers, dir_cntrl_nodes, topology) as produced by the
        delegated create_system(), with the copy-engine sequencer appended
        to cpu_sequencers and the new controllers added to topology.
        dir_cntrl_nodes is returned unchanged.
    """
    # Imported locally so this block does not rely on a file-level import.
    import importlib

    if not buildEnv['GPGPU_SIM']:
        m5.util.panic(
            "This script requires GPGPU-Sim integration to be built.")

    options.access_backing_store = True

    # Run the original protocol script.  The module is looked up by name
    # with importlib instead of exec/eval on a formatted source string.
    buildEnv['PROTOCOL'] = buildEnv['PROTOCOL'].replace('split', 'fusion')
    protocol = buildEnv['PROTOCOL']
    protocol_module = importlib.import_module(protocol)
    try:
        (cpu_sequencers, dir_cntrl_nodes, topology) = \
            protocol_module.create_system(options, full_system, system,
                                          dma_ports, ruby_system)
    except:
        # Deliberately broad: the failure is only annotated with the
        # protocol name and is always re-raised unchanged.
        print("Error: could not create system for ruby protocol inside fusion system %s" % protocol)
        raise

    # Faking things to build the rest of the system
    print("Warning!")
    print("Warning: Faking split MOESI_hammer protocol; collecting checkpoints?")
    print("Warning!")

    # Number of block-offset bits; used for the probe filter index below and
    # for the copy-engine L2 cache index further down.
    block_size_bits = int(math.log(options.cacheline_size, 2))

    if options.num_dev_dirs > 0:
        gpu_phys_mem_size = system.gpu.gpu_memory_range.size()
        # Floor division keeps the per-directory size an integer.
        mem_module_size = gpu_phys_mem_size // options.num_dev_dirs

        #
        # determine size and index bits for probe filter
        # By default, the probe filter size is configured to be twice the
        # size of the L2 cache.
        #
        pf_size = MemorySize(options.sc_l2_size)
        pf_size.value = pf_size.value * 2
        dir_bits = int(math.log(options.num_dev_dirs, 2))
        pf_bits = int(math.log(pf_size.value, 2))
        if options.numa_high_bit:
            if options.pf_on or options.dir_on:
                # if numa high bit explicitly set, make sure it does not
                # overlap with the probe filter index
                assert (options.numa_high_bit - dir_bits > pf_bits)

            # set the probe filter start bit to just above the block offset
            pf_start_bit = block_size_bits
        elif dir_bits > 0:
            pf_start_bit = dir_bits + block_size_bits - 1
        else:
            pf_start_bit = block_size_bits

        dev_mem_ctrls = []
        num_cpu_dirs = len(dir_cntrl_nodes)
        for i in range(options.num_dev_dirs):
            #
            # Create the Ruby objects associated with the directory controller
            #

            # Device directories are numbered after the CPU directories.
            dir_version = i + num_cpu_dirs

            dir_size = MemorySize('0B')
            dir_size.value = mem_module_size

            pf = ProbeFilter(size=pf_size,
                             assoc=4,
                             start_index_bit=pf_start_bit)

            dev_dir_cntrl = Directory_Controller(version = dir_version,
                                 directory = \
                                 RubyDirectoryMemory( \
                                            version = dir_version,
                                            size = dir_size,
                                            numa_high_bit = \
                                            options.numa_high_bit,
                                            device_directory = True),
                                 probeFilter = pf,
                                 probe_filter_enabled = options.pf_on,
                                 full_bit_dir_enabled = options.dir_on,
                                 transitions_per_cycle = options.ports,
                                 ruby_system = ruby_system)

            if options.recycle_latency:
                dev_dir_cntrl.recycle_latency = options.recycle_latency

            # Attach as ruby_system.dev_dir_cntrl<i>.
            setattr(ruby_system, "dev_dir_cntrl%d" % i, dev_dir_cntrl)

            # Connect the directory controller to the network:
            # buffers leaving the directory feed the network's slave side.
            dev_dir_cntrl.forwardFromDir = MessageBuffer()
            dev_dir_cntrl.forwardFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.responseFromDir = MessageBuffer()
            dev_dir_cntrl.responseFromDir.master = ruby_system.network.slave
            dev_dir_cntrl.dmaResponseFromDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaResponseFromDir.master = ruby_system.network.slave

            # Not connected to the network in this function.
            dev_dir_cntrl.triggerQueue = MessageBuffer(ordered=True)

            # Buffers entering the directory are fed from the network's
            # master side.
            dev_dir_cntrl.unblockToDir = MessageBuffer()
            dev_dir_cntrl.unblockToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseToDir = MessageBuffer()
            dev_dir_cntrl.responseToDir.slave = ruby_system.network.master
            dev_dir_cntrl.requestToDir = MessageBuffer()
            dev_dir_cntrl.requestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.dmaRequestToDir = MessageBuffer(ordered=True)
            dev_dir_cntrl.dmaRequestToDir.slave = ruby_system.network.master
            dev_dir_cntrl.responseFromMemory = MessageBuffer()

            # One memory controller per device directory, all over the same
            # GPU memory range; dir_bits is log2(num_dev_dirs).
            dev_mem_ctrl = MemConfig.create_mem_ctrl(
                MemConfig.get(options.mem_type),
                system.gpu.gpu_memory_range, i, options.num_dev_dirs,
                dir_bits, options.cacheline_size)
            dev_mem_ctrl.port = dev_dir_cntrl.memory
            dev_mem_ctrls.append(dev_mem_ctrl)

            topology.addController(dev_dir_cntrl)

        system.dev_mem_ctrls = dev_mem_ctrls

    #
    # Create controller for the copy engine to connect to in GPU cluster
    # Cache is unused by controller
    #
    l1i_cache = L1Cache(size="2kB", assoc=2)
    l1d_cache = L1Cache(size="2kB", assoc=2)
    l2_cache = L2Cache(size="2kB", assoc=2, start_index_bit=block_size_bits)

    # The controller and its sequencer share the same version number.
    ce_version = options.num_cpus + options.num_sc

    l1_cntrl = L1Cache_Controller(version = ce_version,
                                      L1Icache = l1i_cache,
                                      L1Dcache = l1d_cache,
                                      L2cache = l2_cache,
                                      no_mig_atomic = not \
                                          options.allow_atomic_migration,
                                      send_evictions = False,
                                      transitions_per_cycle = options.ports,
                                      ruby_system = ruby_system)

    gpu_ce_seq = RubySequencer(version=ce_version,
                               icache=l1i_cache,
                               dcache=l1d_cache,
                               max_outstanding_requests=64,
                               ruby_system=ruby_system,
                               connect_to_io=False)

    l1_cntrl.sequencer = gpu_ce_seq

    ruby_system.dev_ce_cntrl = l1_cntrl

    cpu_sequencers.append(gpu_ce_seq)
    topology.addController(l1_cntrl)

    # Connect the L1 controller and the network
    # Connect the buffers from the controller to network
    l1_cntrl.requestFromCache = MessageBuffer()
    l1_cntrl.requestFromCache.master = ruby_system.network.slave
    l1_cntrl.responseFromCache = MessageBuffer()
    l1_cntrl.responseFromCache.master = ruby_system.network.slave
    l1_cntrl.unblockFromCache = MessageBuffer()
    l1_cntrl.unblockFromCache.master = ruby_system.network.slave

    # Not connected to the network in this function.
    l1_cntrl.triggerQueue = MessageBuffer()

    # Connect the buffers from the network to the controller
    l1_cntrl.mandatoryQueue = MessageBuffer()
    l1_cntrl.forwardToCache = MessageBuffer()
    l1_cntrl.forwardToCache.slave = ruby_system.network.master
    l1_cntrl.responseToCache = MessageBuffer()
    l1_cntrl.responseToCache.slave = ruby_system.network.master

    return (cpu_sequencers, dir_cntrl_nodes, topology)