Ejemplo n.º 1
0
    def __init__(self, analysis_case: AnalysisCase, arch_case: Design):
        """Initialise the evaluator and cache values derived from the case.

        Args:
            analysis_case: Forwarded unchanged to the parent constructor.
            arch_case: Forwarded unchanged to the parent constructor.

        NOTE(review): ``self.memcase`` and ``self.computecase`` are read
        below but never assigned here; presumably the parent constructor
        stores the two arguments under those names -- confirm.
        """
        super().__init__(analysis_case, arch_case)
        self.total_memory_access = self.get_total_memory_access()

        compute = self.computecase

        # OL2 size: number of elements of the selected basic output tile
        # (W * H) multiplied by the lane, core and chiplet counts.
        basic_tile = cfg.OL1_choices_map[self.memcase.OL1]
        self.OL2_size: size.Size = size.B(
            (basic_tile.W * basic_tile.H) * compute.lane * compute.core *
            compute.chiplet)

        # WL1 value divided by 8 and then by the Wc * Hc chiplet-level
        # spatial factors.
        # NOTE(review): presumably to_b() yields bits, which would make the
        # "/ 8" a bits-to-bytes conversion -- confirm.
        spatial = self.memcase.loopParameter.chiplet_spatial_parameter
        self.real_WL1 = size.B(
            self.memcase.WL1.to_b() / 8 / (spatial.Wc * spatial.Hc))
Ejemplo n.º 2
0
    def get_mem_footprint(self):
        """Return the aggregated buffer footprint of the whole package.

        The AL1, WL1, AL2, OL1 and OL2 terms are summed for one chiplet,
        multiplied by the chiplet count and divided by 8192.

        NOTE(review): if ``to_b()`` yields bits, dividing by 8192
        (8 * 1024) would express the result in KiB -- confirm the unit.
        """
        compute = self.computecase
        mem = self.memcase
        spatial = mem.loopParameter.chiplet_spatial_parameter

        # AL1 scaled by the core count.
        al1_total = mem.AL1.to_b() * compute.core
        # WL1 is divided by Wc * Hc first, then scaled by lanes and cores.
        wl1_total = (mem.WL1.to_b() / (spatial.Wc * spatial.Hc) *
                     compute.lane * compute.core)
        # AL2 is counted once per chiplet.
        al2_total = mem.AL2.to_b()
        # Output buffers use fixed sizes (192 and 8 * 8) per lane and core.
        ol1_total = size.B(192).to_b() * compute.lane * compute.core
        ol2_total = size.B(8 * 8).to_b() * compute.lane * compute.core

        per_chiplet = (al1_total + wl1_total + al2_total + ol1_total +
                       ol2_total)
        return compute.chiplet * per_chiplet / 8192
Ejemplo n.º 3
0
    def get_chiplet_communication(self):
        """Return the chiplet-to-chiplet traffic (weight term + activation term).

        The weight term is always ``size.B(0)``. The activation term is
        non-zero only when rotation is enabled on the loop parameter.
        """
        loop_param = self.memcase.loopParameter
        C0, C1, W1, W2, H1, H2, K1, K2 = loop_param.get_temporal_count()
        Kp, Hp, _, Wc, Hc = loop_param.get_spatial_count()
        workload = self.memcase.workload
        stride = workload.stride
        kernel_size = workload.kernel_size
        basic_tile = cfg.OL1_choices_map[self.memcase.OL1]

        # MUSE-V3 does not support weight rotation reuse, so the weight
        # term stays at zero.
        weight_traffic: size.Size = size.B(0)

        if not loop_param.rotation_enable:
            # No rotation: the activation term is zero as well.
            return weight_traffic + size.B(0)

        # Rotation: 2 * Kp * (Kp - 1) times the input footprint of one basic
        # tile, multiplied by every temporal and spatial loop count.
        activation_traffic: size.Size = size.B(
            2 * Kp * (Kp - 1) * basic_tile.in_tile(kernel_size, stride).size()
            * C0 * C1 * K2 * K1 * Hp * W1 * H1 * W2 * H2 * Wc * Hc)
        return weight_traffic + activation_traffic
Ejemplo n.º 4
0
def get_analysis_cases(design: Design,
                       workload: Workload) -> List[AnalysisCase]:
    """Enumerate every analysis case for one design and one workload.

    The loops run, from outermost to innermost, over: package-level spatial
    divisions, WL1 choices, chiplet-level spatial divisions, basic output
    tiles (OL1), AL2 choices, rotation on/off, AL1 choices and the two
    reorder types. For each (OL1, AL2) pair the level-1 temporal factors
    (W1/H1 and K1) are grown while the candidate still passes the workload
    and buffer checks; the level-2 factors (W2/H2/K2) are then the ceiling
    of the chiplet workload divided by the resulting tile.

    Args:
        design: Hardware design; its ``chiplet``, ``core``, ``lane`` and
            ``vector`` attributes are read.
        workload: Layer description; its output size, channel counts,
            kernel size and stride are read.

    Returns:
        The list of generated ``AnalysisCase`` objects, in generation order.
    """
    # Imported at call time; presumably so that configuration values that
    # are rebound at runtime are picked up -- confirm.
    from config import (ACT_MEMORY_ALIGN, AL1_choices, AL2_choices,
                        OL1_choices_map, W1H1_choices_map, WL1_choices,
                        rotation_search_list)

    cases: List[AnalysisCase] = []
    # Package-level spatial division candidates for this chiplet count.
    package_spatial_parameters = get_package_spatial_parameters()[
        design.chiplet]
    K0 = design.lane

    for package_spatial_parameter in package_spatial_parameters:
        Kp = package_spatial_parameter.Kp
        Hp = package_spatial_parameter.Hp
        Fw = workload.kernel_size.W
        Fh = workload.kernel_size.H
        C0 = design.vector

        # Share of the workload handled by one chiplet (W is not split).
        chiplet_workload_W: int = ceil(workload.out_size.W / 1)
        chiplet_workload_H: int = ceil(workload.out_size.H / Hp)
        chiplet_workload_K: int = ceil(workload.out_channel / Kp)

        # This loop has no meaning in the post-design flow (i.e. MUSE-V3).
        for base_WL1 in WL1_choices:
            # Chiplet-level spatial division candidates: Kc, Hc, Wc.
            chiplet_spatial_parameters = get_chiplet_spatial_parameters()[
                design.core]

            for chiplet_spatial_parameter in chiplet_spatial_parameters:
                Kc = chiplet_spatial_parameter.Kc
                Wc = chiplet_spatial_parameter.Wc
                Hc = chiplet_spatial_parameter.Hc

                # In MUSE-V3, cores that share weight data can fuse their
                # local WL1 into one larger buffer; Wc * Hc is the number of
                # sharing cores. The base value is kept separately so every
                # iteration scales from the original choice.
                WL1 = base_WL1 * Wc * Hc

                # Different basic output-tile sizes.
                for OL1, BasicSize in OL1_choices_map.items():
                    # This loop has no meaning in the post-design flow.
                    for AL2 in AL2_choices:
                        chiplet_workload_in = TileSize(
                            chiplet_workload_W, chiplet_workload_H).in_tile(
                                workload.kernel_size, workload.stride)

                        # Plane dimension: pick the largest number of basic
                        # tiles (mini-tiles) per core that passes both checks.
                        # TODO: To support more cases for the total number of
                        # mini-tiles (HW can do it)
                        plane_choice = 1
                        for candidate in (2, 4, 8, 16):
                            W1H1_choice = W1H1_choices_map[candidate]
                            sub_tile_in = TileSize(
                                BasicSize.H * W1H1_choice.H,
                                BasicSize.W * W1H1_choice.W).in_tile(
                                    workload.kernel_size, workload.stride)
                            tile_in = TileSize(
                                Hc * sub_tile_in.H,
                                Wc * sub_tile_in.W).in_tile(
                                    workload.kernel_size, workload.stride)

                            # Stop when the tile exceeds the chiplet workload.
                            if (tile_in.W > chiplet_workload_in.W
                                    or tile_in.H > chiplet_workload_in.H):
                                break
                            # The tile must fit in AL2, with the input
                            # channels aligned to the vector size.
                            if not (AL2 >= size.B(
                                    tile_in.W * tile_in.H *
                                    ceil(workload.in_channel / C0) * C0)):
                                break
                            plane_choice = candidate

                        W1H1_choice = W1H1_choices_map[plane_choice]
                        W1 = W1H1_choice.W
                        H1 = W1H1_choice.H

                        # Channel dimension: pick the largest K1 that passes
                        # both checks.
                        # TODO: the factor can be any integer, e.g. 3, but
                        # then the margin case needs handling.
                        K1 = 1
                        for channel_factor in (1, 2, 4, 8, 16):
                            # Stop once this product reaches the chiplet
                            # workload in K.
                            if (Kp * Kc * K0 * channel_factor >=
                                    chiplet_workload_K):
                                break
                            # The weights must fit in WL1, with the input
                            # channels aligned to the vector size.
                            if not (WL1 >= size.B(
                                    Fw * Fh * ceil(workload.in_channel / C0) *
                                    C0 * channel_factor)):
                                break
                            K1 = channel_factor

                        # Tile extents and level-2 temporal loop counts.
                        tile_W = BasicSize.W * W1 * Wc
                        tile_H = BasicSize.H * H1 * Hc
                        tile_K = K0 * K1 * Kc

                        W2: int = ceil(chiplet_workload_W / tile_W)
                        H2: int = ceil(chiplet_workload_H / tile_H)
                        K2: int = ceil(chiplet_workload_K / tile_K)

                        # Rotation and non-rotation cases.
                        for rotation_enable in rotation_search_list:
                            if rotation_enable:
                                # Rotation requires the input channels to be
                                # a multiple of ACT_MEMORY_ALIGN * chiplets.
                                if workload.in_channel % (
                                        ACT_MEMORY_ALIGN *
                                        design.chiplet) != 0:
                                    continue
                                aligned_CI = ceil(
                                    workload.in_channel /
                                    (C0 * Kp)) * C0 * Kp
                            else:
                                aligned_CI = ceil(
                                    workload.in_channel / C0) * C0

                            aligned_workload = Workload(
                                aligned_CI, tile_K * K2 * Kp,
                                Block(tile_H * H2, tile_W * W2),
                                workload.kernel_size, workload.stride)

                            loopParameter = LoopParameter(
                                W1, H1, K1, W2, H2, K2,
                                package_spatial_parameter,
                                chiplet_spatial_parameter, aligned_workload,
                                design, rotation_enable)

                            # This loop has no meaning in the post-design
                            # flow (i.e. MUSE-V3).
                            for AL1 in AL1_choices:
                                if AL1 > AL2:
                                    continue
                                # One case per reorder type.
                                reorder_cases = [
                                    ReorderCase(type_n) for type_n in (1, 2)
                                ]
                                cases.extend(
                                    AnalysisCase(OL1, AL1, WL1, AL2,
                                                 reorder_case, loopParameter,
                                                 aligned_workload)
                                    for reorder_case in reorder_cases)

    return cases
Ejemplo n.º 5
0
# 8b-A, 4b-W Mode Configurations:
# num_chiplets:   List[int] = [4]
# num_cores:      List[int] = [8]
# num_lanes:      List[int] = [16]
# size_vectors:   List[int] = [8]

# 4b-A, 4b-W Mode Configurations:
# num_chiplets:   List[int] = [4]
# num_cores:      List[int] = [8]
# num_lanes:      List[int] = [16]
# size_vectors:   List[int] = [16]


# 8b-A, 8b-W Mode Configurations:
# Each *_choices list holds the candidate sizes for that buffer (a single
# candidate each in this mode).
AL2_choices: List[size.Size] = [size.B(46080)]
AL1_choices: List[size.Size] = [size.B(8192)]
WL1_choices: List[size.Size] = [size.B(1168)]
# TODO: MUSE-V3 can support more basic sizes
# Square basic output tiles keyed by their OL1 size; each key is
# 3 * edge * edge: 3 -> 1x1, 12 -> 2x2, 48 -> 4x4, 192 -> 8x8.
OL1_choices_map: Dict[size.Size, TileSize] = {
    size.B(3 * edge * edge): TileSize(edge, edge)
    for edge in (1, 2, 4, 8)
}

# 16b-A, 8b-W Mode Configurations:
# AL2_choices: List[size.Size] = [size.B(23040)]
# AL1_choices: List[size.Size] = [size.B(4096)]
# WL1_choices: List[size.Size] = [size.B(1168)]
# TODO: MUSE-V3 can support more basic sizes
Ejemplo n.º 6
0
    def get_energy(self):
        """Compute the total energy and its per-component breakdown.

        Returns:
            A 4-tuple ``(total_energy, Energy_DRAMtoSRAM_W,
            Energy_DRAMtoSRAM_A, energy_breakdown)`` where
            ``energy_breakdown`` is an ``EnergyBreakdown`` built from the
            DRAM, die-to-die, A-L2, A-L1, W-L1, output and MAC terms.
        """
        total_runtime = self.get_runtime()
        chiplet_communication = self.get_chiplet_communication()

        access = self.total_memory_access
        compute = self.computecase
        e_grs = cfg.Energy_GRS
        e_dram = cfg.Energy_DRAM

        # Access counts pre-multiplied by their bandwidth ratio. Each product
        # is the left-most factor of the expressions below, so evaluation
        # order (and hence floating-point rounding) is unchanged.
        act_al2_wr = cfg.A_BW_RATIO * access.AL2_Wr.to_b()
        act_al1_wr = cfg.A_BW_RATIO * access.AL1_Wr.to_b()
        act_al1_rd = cfg.A_BW_RATIO * access.AL1_Rd.to_b()
        act_ol2_rd = cfg.A_BW_RATIO * access.OL2_Rd.to_b()
        act_ol2_wr = cfg.A_BW_RATIO * access.OL2_Wr.to_b()
        wgt_wl1_wr = cfg.W_BW_RATIO * access.WL1_Wr.to_b()
        # NOTE(review): the WL1 read count is scaled by the activation ratio
        # while the WL1 write count uses the weight ratio -- confirm.
        act_wl1_rd = cfg.A_BW_RATIO * access.WL1_Rd.to_b()

        # Energy of the DRAM -> SRAM transfers for activations and weights.
        Energy_DRAMtoSRAM_A: float = act_al2_wr * (
            e_grs + e_dram + cfg.Energy_AL2_Wr)
        Energy_DRAMtoSRAM_W: float = wgt_wl1_wr * (
            e_grs + e_dram + cfg.Energy_WL1_Wr)

        # MAC energy: runtime times per-MAC energy, data width and the total
        # lane * vector * chiplet * core product.
        Energy_TotalMAC: float = (
            total_runtime * cfg.Energy_MAC * cfg.DATA_WIDTH *
            (compute.lane * compute.vector * compute.chiplet * compute.core))

        # ---- Energy breakdown ----
        # DRAM term: OL2 reads, WL1 writes and AL2 writes, each charged with
        # the DRAM energy and the GRS energy.
        DRAM_energy: float = (act_ol2_rd * e_dram
                              + wgt_wl1_wr * e_dram
                              + act_al2_wr * e_dram
                              + act_al2_wr * e_grs
                              + wgt_wl1_wr * e_grs
                              + act_ol2_rd * e_grs)

        D2D_energy: float = (cfg.A_BW_RATIO * chiplet_communication.to_b() *
                             e_grs)

        # NOTE(review): the AL2 read energy is multiplied by the AL1 read
        # count, not an AL2 read count -- confirm.
        A_L2_energy: float = (act_al2_wr * cfg.Energy_AL2_Wr
                              + act_al1_rd * cfg.Energy_AL2_Rd)

        A_L1_energy: float = (act_al1_wr * cfg.Energy_AL1_Wr
                              + act_al1_rd * cfg.Energy_AL1_Rd)

        W_L1_energy: float = (wgt_wl1_wr * cfg.Energy_WL1_Wr
                              + act_wl1_rd * cfg.Energy_WL1_Rd)

        # OL1 accesses are charged with the RF energy of a fixed 384 size
        # and a factor of 4.
        rf_energy = cfg.get_Energy_RF(size.B(384))
        output_energy: float = (act_ol2_rd * cfg.Energy_OL2_Rd
                                + act_ol2_wr * cfg.Energy_OL2_Wr
                                + 4 * access.OL1_Wr.to_b() * rf_energy
                                + 4 * access.OL1_Rd.to_b() * rf_energy)

        MAC_energy: float = Energy_TotalMAC

        total_energy = (DRAM_energy + D2D_energy + A_L2_energy + A_L1_energy
                        + W_L1_energy + output_energy + MAC_energy)
        energy_breakdown = EnergyBreakdown(DRAM_energy, D2D_energy,
                                           A_L2_energy, A_L1_energy,
                                           W_L1_energy, output_energy,
                                           Energy_TotalMAC)

        return (total_energy, Energy_DRAMtoSRAM_W, Energy_DRAMtoSRAM_A,
                energy_breakdown)
Ejemplo n.º 7
0
    def get_dram_communication(self):
        """Return the DRAM traffic as a ``size.B`` value.

        The traffic is the sum of the OL2 read, WL1 write and AL2 write
        totals taken from ``self.total_memory_access``.
        """
        access = self.total_memory_access
        ol2_read = access.OL2_Rd.to_b()
        wl1_write = access.WL1_Wr.to_b()
        al2_write = access.AL2_Wr.to_b()
        return size.B(ol2_read + wl1_write + al2_write)
Ejemplo n.º 8
0
def search(workload: Workload, writer: csv.DictWriter, note: str):
    """Evaluate every analysis case of every design and write one row each.

    For each entry of the module-level ``designs`` the analysis cases are
    generated with ``get_analysis_cases``, evaluated with ``OverheadEval``
    and written to ``writer`` as one dictionary row per case.

    Args:
        workload: Workload passed to ``get_analysis_cases``; its channel
            counts, output size, kernel size and stride are also logged.
        writer: Receives one ``writerow`` call per analysis case.
        note: Free-form text stored in the ``'note'`` column of every row.
    """
    for design in tqdm(designs,
                       desc='Try Different Hardware Parallel Designs'):
        sram_cases: List[AnalysisCase] = get_analysis_cases(design, workload)
        for sram_case in sram_cases:
            evaluator = OverheadEval(sram_case, design)
            (total_memory, total_memory_access, chiplet_communication,
             total_runtime, area_per_chiplet, area_per_package, total_energy,
             Energy_DRAMtoSRAM_W, Energy_DRAMtoSRAM_A,
             energy_breakdown) = evaluator.evaluation()

            dram_access = evaluator.get_dram_communication()

            # Debug switch: set to a truthy value to print the per-module
            # energy constants for the first case and then exit.
            view_module_energy = 0
            if view_module_energy:
                print("\n")
                print('Chip to Chip: %f' % Energy_GRS)
                print('DRAM to SRAM_A: %f' %
                      (Energy_GRS + Energy_DRAM + Energy_AL2_Wr))
                print('DRAM to SRAM_W: %f' %
                      (Energy_GRS + Energy_DRAM + Energy_WL1_Wr))
                print('AL2 to AL1: %f' % (Energy_AL2_Rd + Energy_AL1_Wr))
                # Each label is paired with the read energy of the buffer it
                # names (these two values used to be swapped).
                print('AL1 to MAC: %f' % (Energy_AL1_Rd))
                print('WL1 to MAC: %f' % (Energy_WL1_Rd))
                print('MAC to OL1(RF): %f' % (get_Energy_RF(size.B(384))))
                print('OL1(RF) to OL2: %f' %
                      (Energy_OL2_Wr + get_Energy_RF(sram_case.OL1)))
                print('OL2 to DRAM: %f' %
                      (Energy_GRS + Energy_DRAM + Energy_OL2_Rd))
                print('AL1 Size: %s' % str(sram_case.AL1))
                print('AL2 Size: %s' % str(sram_case.AL2))
                print('WL1 Size: %s' % str(sram_case.WL1))
                print('Area of 1 Chiplet: %s' %
                      str(area_per_chiplet / 1000000))
                sys.exit()

            loop = sram_case.loopParameter
            package_spatial = loop.package_spatial_parameter
            chiplet_spatial = loop.chiplet_spatial_parameter

            writer.writerow({
                # Hardware design
                'Chiplet': design.chiplet,
                'Core': design.core,
                'Lane': design.lane,
                'Vector_Size': design.vector,
                # Workload (only the H component of 2-D sizes is logged)
                'cin': workload.in_channel,
                'cout': workload.out_channel,
                'out_size': workload.out_size.H,
                'kernel_size': workload.kernel_size.H,
                'stride': workload.stride.H,
                # Buffer sizes of this case
                'OL1': sram_case.OL1.to_b(),
                'AL1': sram_case.AL1.to_b(),
                'WL1': sram_case.WL1.to_b(),
                'real_WL1': evaluator.real_WL1.to_b(),
                'AL2': sram_case.AL2.to_b(),
                'reorder_case': sram_case.reorderCase.type_n,
                'rotation_enable': loop.rotation_enable,
                # Evaluation results
                'runtime': total_runtime,
                'area-chiplet': area_per_chiplet,
                'area-package': area_per_package,
                'total_memory_footprint': total_memory,
                'total_energy': total_energy,
                'chiplet_communication': chiplet_communication.to_b(),
                'dram_communication': dram_access.to_b(),
                'DRAM_energy_A': Energy_DRAMtoSRAM_A,
                'DRAM_energy_W': Energy_DRAMtoSRAM_W,
                'DRAM_Energy': energy_breakdown.DRAM_energy,
                'Die-to-Die_Energy': energy_breakdown.D2D_energy,
                'A-L2_Energy': energy_breakdown.A_L2_energy,
                'A-L1_Energy': energy_breakdown.A_L1_energy,
                'W-L1_Energy': energy_breakdown.W_L1_energy,
                'Output_Energy': energy_breakdown.output_energy,
                'MAC_Energy': energy_breakdown.Energy_TotalMAC,
                # Memory access totals
                'MOL1': total_memory_access.OL1_Wr.to_b(),
                'MAL1': total_memory_access.AL1_Wr.to_b(),
                'MWL1': total_memory_access.WL1_Wr.to_b(),
                # NOTE(review): unlike its neighbours this value is written
                # without .to_b() -- confirm that is intended.
                'sMWL1': evaluator.get_sram_access().WL1_Wr,
                'MAL2': total_memory_access.AL2_Wr.to_b(),
                # Loop parameters
                'X1': loop.W1,
                'Y1': loop.H1,
                'K1': loop.K1,
                'X2': loop.W2,
                'Y2': loop.H2,
                'K2': loop.K2,
                'Kp': package_spatial.Kp,
                'Yp': package_spatial.Hp,
                'Kc': chiplet_spatial.Kc,
                'Yc': chiplet_spatial.Hc,
                'Xc': chiplet_spatial.Wc,
                'C1': loop.C1,
                'C0': loop.C0,
                'Csa': loop.Csa,
                #  'Ksw': loop.Ksw,
                'X0': OL1_choices_map[sram_case.OL1].W,
                'Y0': OL1_choices_map[sram_case.OL1].H,
                'note': note
            })
Ejemplo n.º 9
0
    def get_sram_access(self) -> MemoryAccess:
        """Compute the read/write access counts of the on-chip buffers.

        Returns:
            A ``MemoryAccess`` built from, in order: WL1 write, WL1 read,
            OL1 write, OL1 read, AL1 write, AL1 read, AL2 write, AL2 read,
            OL2 write and OL2 read.

        Raises:
            ValueError: If AL1 is smaller than the first AL1 critical
                capacity reported by ``c3p_analysis``.
        """
        memcase = self.memcase
        workload = memcase.workload
        loop_param = memcase.loopParameter

        Fw, Fh = workload.kernel_size.get_params()
        Csa = loop_param.get_rotation_count()
        W0, H0 = cfg.OL1_choices_map[memcase.OL1].get_params()
        C0, C1, W1, W2, H1, H2, K1, K2 = loop_param.get_temporal_count()
        _, _, _, Wc, Hc = loop_param.get_spatial_count()

        c3p_info, Cc_fake = self.c3p_analysis()

        # ---- WL1 (one WL1) ----
        # Minimum write count, then scaled by the penalty factor.
        wl1_min_write = size.B(Fw * Fh * C0 * C1 * Csa * K1 * K2)
        wl1_penalty = self.get_penalty(memcase.WL1, c3p_info['WL1'], 1)
        wl1_write = wl1_min_write * wl1_penalty
        # Weight-stationary read count.
        wl1_read = size.B(Fw * Fh * K1 * K2 * C0 * C1 * Csa * W1 * H1 * W2 *
                          H2)

        # ---- OL1 & OL2 ----
        # The factor 4 in the OL1 terms reflects 32-bit partial sums.
        ol1_mac_read = size.B(4 * W0 * H0 * Fw * Fh * C1 * Csa * W1 * H1 *
                              W2 * H2 * K2 * K1 - 1)
        ol1_drain_read = size.B(4 * W0 * H0 * W1 * H1 * W2 * H2 * K2 * K1)
        ol1_read = ol1_mac_read + ol1_drain_read  # MAC read + read to OL2
        ol1_write = size.B(4 * W0 * H0 * Fw * Fh * C1 * Csa * W1 * H1 * W2 *
                           H2 * K2 * K1)  # MAC update
        # Read from OL1 and written to OL2.
        ol2_write = size.B(W0 * H0 * W1 * H1 * W2 * H2 * K1 * K2 * Wc * Hc)
        # Read from OL2 and written to DDR.
        ol2_read = size.B(W0 * H0 * W1 * H1 * W2 * H2 * K1 * K2 * Wc * Hc)

        # ---- AL1 ----
        # AL1 behaves like a pipeline register reused for one H0 * W0 basic
        # tile, so it must hold at least the first critical capacity.
        if memcase.AL1 < c3p_info['AL1']['Critical_Capacity'][0]:
            raise ValueError(
                'The capacity should larger than Critical-Capacity-0')
        basic_tile_in = TileSize(W0, H0).in_tile(workload.kernel_size,
                                                 workload.stride)
        H0_in = basic_tile_in.H
        W0_in = basic_tile_in.W

        # All the temporal loop counts act as penalty for the AL1 writes.
        al1_write = size.B(W0_in * H0_in * c3p_info['AL1']['Penalty'][1] *
                           C0 * C1 * Csa)
        al1_read = size.B(C0 * Csa * C1 * K1 * K2 * W0_in * H0_in * Fw * Fh *
                          W1 * W2 * H1 * H2)

        # ---- AL2 ----
        # Input footprint of one level-1 tile and of the full level-2 extent.
        level1_in_size = cfg.TileSize(
            H0 * H1 * Hc, W0 * W1 * Wc).in_tile(workload.kernel_size,
                                                workload.stride).size()
        level2_in_size = cfg.TileSize(
            H0 * H1 * H2 * Hc,
            W0 * W1 * W2 * Wc).in_tile(workload.kernel_size,
                                       workload.stride).size()

        if memcase.AL2 < Cc_fake['AL2'][2]:
            # AL2 is below this threshold: every one of the W2 * H2 tiles is
            # counted with the level-1 tile footprint.
            al2_footprint = level1_in_size * W2 * H2
        else:
            # AL2 can buffer the whole chiplet workload.
            al2_footprint = level2_in_size

        al2_penalty = self.get_penalty(memcase.AL2, c3p_info['AL2'], 0)
        al2_write = size.B(al2_footprint * al2_penalty * C0 * C1)

        # AL2 reads are the AL1 writes scaled by Hc * Wc.
        al2_read = al1_write * Hc * Wc

        return MemoryAccess(wl1_write, wl1_read, ol1_write, ol1_read,
                            al1_write, al1_read, al2_write, al2_read,
                            ol2_write, ol2_read)
Ejemplo n.º 10
0
    def get_c3p_info(self):
        """Compute critical capacities and penalties for WL1, AL1 and AL2.

        The loop order returned by ``self.case.reorderCase.getreorder()`` is
        scanned backwards (from a given position towards index 0) to decide
        whether each critical capacity is meaningful and which loop counts
        multiply into the matching penalty.

        Returns:
            ``(self.c3p_info, self.Cc_fake)``. ``c3p_info`` maps ``'WL1'``,
            ``'AL1'`` and ``'AL2'`` to dicts holding a
            ``'Critical_Capacity'`` list and a ``'Penalty'`` list.
            ``Cc_fake`` maps ``'AL2'`` to ``[Cc1_fake, Cc2_fake, Cc3_fake]``.
            A critical capacity is ``None`` when it does not make sense for
            the current loop order.

        Raises:
            ValueError: Presumably (list semantics) when K1, K2, H1 or H2 is
                missing from the loop order, via the ``index`` lookups.
        """
        loop_order = self.case.reorderCase.getreorder()
        end = len(loop_order)

        basic_workload = OL1_choices_map[self.case.OL1]
        W0, H0 = basic_workload.get_params()
        W0_in, H0_in = basic_workload.in_tile(
            self.case.workload.kernel_size,
            self.case.workload.stride).get_params()

        C0, C1, W1, W2, H1, H2, K1, K2 = \
            self.case.loopParameter.get_temporal_count()
        _, _, _, Wc, Hc = self.case.loopParameter.get_spatial_count()
        Csa = self.case.loopParameter.get_rotation_count()

        self.c3p_info = {}
        self.Cc_fake = {}

        # ------------------------------ WL1 ------------------------------
        # WL1 corresponds to one lane.
        # Cc0: one basic workload (MUSE-v3 satisfies it for most workloads).
        # Cc1: buffer all input channels (extreme cases may fail and should
        #      then consider ping-pong in the compiler).
        # Cc2: buffer all mini-tiles with all input channels. In
        #      generate_case.py, WL1 >= Cc1 already implies Cc2.
        # Cc3: buffer all weights.
        Fx, Fy = self.case.workload.kernel_size.get_params()

        # At least one mapping of C0 kernels for a lane.
        self.WL1_Cc0: size.Size = size.B(Fx * Fy * C0)
        self.WL1_Cc1: Optional[size.Size] = self.WL1_Cc0 * C1 * Csa
        self.WL1_Cc2: Optional[size.Size] = None
        self.WL1_Cc3: Optional[size.Size] = None

        self.WL1_Cc0_Penalty: int = 1
        self.WL1_Cc1_Penalty: int = self._c3p_penalty(
            loop_order, end, WImpactFactors)

        # NOTE(review): every penalty scan starts at the position of its
        # anchor loop (K1/K2/H1/H2), regardless of where the first real loop
        # was found by the capacity check -- confirm this is intended.
        K1Index = loop_order.index(ForLoopSymbol.K1)
        if self._c3p_capacity_is_meaningful(loop_order, K1Index,
                                            WImpactFactors):
            self.WL1_Cc2 = self.WL1_Cc0 * C1 * Csa * K1
        self.WL1_Cc2_Penalty: int = self._c3p_penalty(
            loop_order, K1Index, WImpactFactors)

        K2Index = loop_order.index(ForLoopSymbol.K2)
        if self._c3p_capacity_is_meaningful(loop_order, K2Index,
                                            WImpactFactors):
            self.WL1_Cc3 = self.WL1_Cc0 * C1 * Csa * K1 * K2
        self.WL1_Cc3_Penalty: int = self._c3p_penalty(
            loop_order, K2Index, WImpactFactors)

        self.c3p_info['WL1'] = {
            'Critical_Capacity': [
                self.WL1_Cc0, self.WL1_Cc1, self.WL1_Cc2, self.WL1_Cc3
            ],
            'Penalty': [
                self.WL1_Cc0_Penalty, self.WL1_Cc1_Penalty,
                self.WL1_Cc2_Penalty, self.WL1_Cc3_Penalty
            ],
        }

        # ------------------------------ AL1 ------------------------------
        # AL1 is just like a (very small) vector register in the pipeline,
        # so its penalty is the product of all loop counts, i.e. the number
        # of basic tiles.
        self.AL1_Cc0: size.Size = size.B(W0_in * H0_in * C0)
        self.AL1_Cc0_Penalty: int = 1
        # An empty impact-factor set means no loop stops the scan.
        self.AL1_Penalty: int = self._c3p_penalty(loop_order, end, ())

        self.c3p_info['AL1'] = {
            'Critical_Capacity': [self.AL1_Cc0],
            'Penalty': [self.AL1_Cc0_Penalty, self.AL1_Penalty],
        }

        # ------------------------------ AL2 ------------------------------
        # Cc0: at least one mapping for all cores.
        # Cc1: can buffer all input channels.
        # Cc2: can buffer all mini-tiles.
        # Cc3: can buffer all tiles (e.g. layers with a very small feature
        #      map).
        # generate_case.py already guarantees AL2 >= Cc2; the checks below
        # guard against exceptional cases that may have been overlooked.
        tileSize = TileSize(H0 * Hc, W0 * Wc)
        tileSize_in = tileSize.in_tile(self.case.workload.kernel_size,
                                       self.case.workload.stride)
        W_Cc2 = W0 * W1 * Wc
        H_Cc2 = H0 * H1 * Hc
        W_Cc3 = W0 * W1 * Wc * W2
        H_Cc3 = H0 * H1 * Hc * H2

        self.AL2_Cc0: size.Size = size.B(tileSize_in.size() * C0)
        # "fake" capacities are always computed; the plain ones below are
        # only set when the loop order makes them meaningful.
        self.AL2_Cc1_fake: size.Size = self.AL2_Cc0 * C1
        self.AL2_Cc2_fake = size.B(
            TileSize(H_Cc2, W_Cc2).in_tile(self.case.workload.kernel_size,
                                           self.case.workload.stride).size() *
            C0 * C1)
        self.AL2_Cc3_fake = size.B(
            TileSize(H_Cc3, W_Cc3).in_tile(self.case.workload.kernel_size,
                                           self.case.workload.stride).size() *
            C0 * C1)

        self.AL2_Cc1: Optional[size.Size] = None
        self.AL2_Cc2: Optional[size.Size] = None
        self.AL2_Cc3: Optional[size.Size] = None

        self.AL2_Cc0_Penalty: int = 1

        # Cc1: scan the whole loop order.
        if self._c3p_capacity_is_meaningful(loop_order, end,
                                            IAImpactFactors):
            self.AL2_Cc1 = self.AL2_Cc1_fake
        self.AL2_Cc1_Penalty: int = self._c3p_penalty(
            loop_order, end, IAImpactFactors)

        # Cc2: scan the loops before H1.
        H1Index = loop_order.index(ForLoopSymbol.H1)
        if self._c3p_capacity_is_meaningful(loop_order, H1Index,
                                            IAImpactFactors):
            self.AL2_Cc2 = self.AL2_Cc2_fake
        self.AL2_Cc2_Penalty: int = self._c3p_penalty(
            loop_order, H1Index, IAImpactFactors)

        # Cc3: scan the loops before H2.
        H2Index = loop_order.index(ForLoopSymbol.H2)
        if self._c3p_capacity_is_meaningful(loop_order, H2Index,
                                            IAImpactFactors):
            self.AL2_Cc3 = self.AL2_Cc3_fake
        self.AL2_Cc3_Penalty: int = self._c3p_penalty(
            loop_order, H2Index, IAImpactFactors)

        self.c3p_info['AL2'] = {
            'Critical_Capacity': [
                self.AL2_Cc0, self.AL2_Cc1, self.AL2_Cc2, self.AL2_Cc3
            ],
            'Penalty': [
                self.AL2_Cc0_Penalty, self.AL2_Cc1_Penalty,
                self.AL2_Cc2_Penalty, self.AL2_Cc3_Penalty
            ],
        }
        self.Cc_fake['AL2'] = [
            self.AL2_Cc1_fake, self.AL2_Cc2_fake, self.AL2_Cc3_fake
        ]

        return self.c3p_info, self.Cc_fake

    def _c3p_capacity_is_meaningful(self, loop_order, stop, impact_factors):
        """Tell whether the first real loop before ``stop`` is not an impact factor.

        Scans ``loop_order[:stop]`` backwards for the first loop whose count
        differs from 1. Returns ``True`` only if such a loop exists and is
        not contained in ``impact_factors``.
        """
        for symbol in reversed(loop_order[:stop]):
            if self.case.loopParameter.symbol_to_count(symbol) != 1:
                return symbol not in impact_factors
        return False

    def _c3p_penalty(self, loop_order, stop, impact_factors):
        """Multiply loop counts backwards from ``stop`` until an impact factor.

        Scans ``loop_order[:stop]`` backwards and multiplies the counts of
        the visited loops, stopping (exclusive) at the first loop contained
        in ``impact_factors``. Returns 1 when nothing is visited.
        """
        penalty = 1
        for symbol in reversed(loop_order[:stop]):
            if symbol in impact_factors:
                break
            penalty = penalty * self.case.loopParameter.symbol_to_count(
                symbol)
        return penalty
Ejemplo n.º 11
0
def refresh_conf(actbit: float, wetbit: float):
    """Rebind the module-level configuration globals for a bit-width mode.

    ``DATA_WIDTH`` / ``WEIGHT_WIDTH`` are set to the given widths, the two
    bandwidth ratios are set to ``width / 8``, and ``ACT_MEMORY_ALIGN`` is
    set to 8 for a 16-bit data width or 16 for an 8- or 4-bit data width.

    The candidate lists are then selected by the (data, weight) pair:

    ========  =====  ==========  =====  =====  ====
    (A, W)    lanes  vector len  AL2    AL1    WL1
    ========  =====  ==========  =====  =====  ====
    (16, 8)   8      8           23040  4096   1168
    (8, 8)    16     8           46080  8192   1168
    (8, 4)    16     8           46080  8192   2336
    (4, 4)    16     16          92160  16384  2336
    ========  =====  ==========  =====  =====  ====

    Every listed mode uses 4 chiplets, 8 cores and the same OL1 tile map.
    A pair that matches none of the branches below leaves the candidate
    lists untouched by those branches.

    Args:
        actbit: Data bit width; assigned to ``DATA_WIDTH``.
        wetbit: Weight bit width; assigned to ``WEIGHT_WIDTH``.

    Raises:
        ValueError: If ``actbit`` is not 4, 8 or 16. The widths and the
            bandwidth ratios have already been rebound at that point.
    """
    global DATA_WIDTH, WEIGHT_WIDTH, A_BW_RATIO, W_BW_RATIO
    global ACT_MEMORY_ALIGN
    global num_chiplets, num_cores, num_lanes, size_vectors
    global AL2_choices, AL1_choices, WL1_choices, OL1_choices_map

    def _ol1_tile_map():
        # Built fresh on every call so each mode switch gets its own dict.
        # TODO: MUSE-V3 can support more basic sizes
        return {
            size.B(3): TileSize(1, 1),
            size.B(12): TileSize(2, 2),
            size.B(48): TileSize(4, 4),
            size.B(192): TileSize(8, 8),
        }

    DATA_WIDTH = actbit
    WEIGHT_WIDTH = wetbit
    A_BW_RATIO = actbit / 8
    W_BW_RATIO = wetbit / 8

    # Reject unsupported data widths before touching the alignment.
    if actbit not in (16, 8, 4):
        raise ValueError('The DATA-WIDTH only supports 4, 8, and 16bit')
    ACT_MEMORY_ALIGN = 8 if actbit == 16 else 16

    mode = (actbit, wetbit)
    if mode == (16, 8):
        # 16b-A, 8b-W Mode Configurations:
        num_chiplets, num_cores, num_lanes, size_vectors = [4], [8], [8], [8]
        AL2_choices = [size.B(23040)]
        AL1_choices = [size.B(4096)]
        WL1_choices = [size.B(1168)]
        OL1_choices_map = _ol1_tile_map()
    elif mode == (8, 8):
        # 8b-A, 8b-W Mode Configurations:
        num_chiplets, num_cores, num_lanes, size_vectors = [4], [8], [16], [8]
        AL2_choices = [size.B(46080)]
        AL1_choices = [size.B(8192)]
        WL1_choices = [size.B(1168)]
        OL1_choices_map = _ol1_tile_map()
    elif mode == (8, 4):
        # 8b-A, 4b-W Mode Configurations:
        num_chiplets, num_cores, num_lanes, size_vectors = [4], [8], [16], [8]
        AL2_choices = [size.B(46080)]
        AL1_choices = [size.B(8192)]
        WL1_choices = [size.B(2336)]
        OL1_choices_map = _ol1_tile_map()
    elif mode == (4, 4):
        # 4b-A, 4b-W Mode Configurations:
        num_chiplets, num_cores, num_lanes, size_vectors = [4], [8], [16], [16]
        AL2_choices = [size.B(92160)]
        AL1_choices = [size.B(16384)]
        WL1_choices = [size.B(2336)]
        OL1_choices_map = _ol1_tile_map()