Example #1
                            def add_kernel_data(ll, p, col_target, b):
                                col = kern_offs[ll] + col_target
                                if col >= tc.dev.mask_width(p):
                                    eprint(
                                        f'\nKernel memory exceeded in layer {ll}.'
                                        '\n\nKernel map so far:')
                                    print_map(layers,
                                              kernel_map,
                                              print_fn=eprint_noprefix)
                                    sys.exit(1)

                                if kernels_used[p][
                                        col] == 0:  # Update kernel map
                                    assert kernel_map[p][col] == _INVALID_VALUE
                                    kernel_map[p][col] = ll

                                assert kernels_used[p][col] <= 8
                                kernel_data[p][col][
                                    8 - kernels_used[p][col]] = b & 0xff
                                kernels_used[p][col] += 1

                                if kernels_used[p][col] == 9:  # Flush
                                    col_target += 1  # Write 1

                                return col_target
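Every example on this page calls an `eprint` helper (and sometimes `eprint_noprefix`). The exact implementations differ between the projects shown here; a minimal sketch of the common pattern, writing to standard error, might look like this (the `error=` keyword accepted by some call sites is an assumption about those projects' helpers):

import sys

def eprint(*args, error=True, **kwargs):
    """Print a message to stderr; `error` only selects the prefix in this sketch."""
    print('ERROR:' if error else 'WARNING:', *args, file=sys.stderr, **kwargs)

def eprint_noprefix(*args, **kwargs):
    """Print to stderr without a prefix (usable as a print_fn callback)."""
    print(*args, file=sys.stderr, **kwargs)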
Example #2
 def check_overwrite(
     p,
     target_offs,
     in_map,
     out_map,
     c,
     row,
     col,
 ):
     # If using single layer, make sure we're not overwriting the input
     if (not overwrite_ok) and in_map[target_offs >> 2] is not None:
         old_ll, old_c, old_row, old_col, _ = in_map[target_offs >> 2]
         eprint(
             f'Processor {p}: '
             f'Layer {ll} output for CHW={c},{row},{col} is overwriting '
             f'input at offset 0x{target_offs:08x} that was created by '
             f'layer {old_ll}, CHW={old_c},{old_row},{old_col}.',
             error=not no_error_stop)
         if not no_error_stop:
             sys.exit(1)
     # Check we're not overflowing the data memory
     if (not overwrite_ok) and out_map is not None and out_map[
             target_offs >> 2] is not None:
         old_ll, old_c, old_row, old_col, old_val = out_map[target_offs
                                                            >> 2]
         eprint(
             f'Processor {p}: '
             f'Layer {ll} output for CHW={c},{row},{col} is overwriting '
             f'offset 0x{target_offs:08x}. Previous write by '
              f'layer {old_ll}, CHW={old_c},{old_row},{old_col} with value 0x{old_val:08x}.',
             error=not no_error_stop)
         if not no_error_stop:
             sys.exit(1)
Example #3
    def construct_mapping(self, node, deep=False):
        if not isinstance(node, yaml.MappingNode):
            raise yaml.constructor.ConstructorError(
                None, None, "Expected a mapping node, but found %s" % node.id,
                node.start_mark)

        mapping = {}
        for key_node, value_node in node.value:
            key = self.construct_object(key_node, deep=deep)
            try:
                hash(key)
            except TypeError as exc:
                eprint(f'Found unacceptable key {exc} {key_node.start_mark} '
                       f'while constructing a mapping {node.start_mark}')
                sys.exit(1)

            # check for duplicate keys
            if key in mapping:
                eprint(f'Found duplicate key {key} '
                       f'while constructing a mapping{node.start_mark}')
                sys.exit(1)
            value = self.construct_object(value_node, deep=deep)
            mapping[key] = value

        return mapping
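One common way to use such a `construct_mapping` override (a sketch, not the exact wiring in the original project; the class and file names are illustrative) is to place it on a `yaml.SafeLoader` subclass, which PyYAML then uses when constructing every mapping node:

import yaml

class UniqueKeyLoader(yaml.SafeLoader):
    """SafeLoader variant whose construct_mapping (defined above) rejects
    unhashable and duplicate keys."""
    # construct_mapping = the method shown above

with open('config.yaml') as f:  # illustrative file name
    cfg = yaml.load(f, Loader=UniqueKeyLoader)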
Example #4
 def error_exit(message, sequence):
     """
     Print error message `message` for layer sequence `sequence` and exit.
     """
     eprint(
         f'{message} (found in layer sequence {sequence} in YAML configuration).'
     )
     sys.exit(1)
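A hypothetical call, e.g. while validating one layer of the YAML configuration (the message text and sequence index are illustrative only):

error_exit('Unsupported value for `operation`', 3)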
Example #5
 def check_filename(self):
     if self.filename[-2:] != '.s':
         eprint('error: non-assembly file passed:', self.filename)
         sys.exit(1)
     elif len(self.filename) >= len('.cdi.s') and \
          self.filename[-6:] == '.cdi.s':
         eprint('error: cdi-assembly file passed:', self.filename)
         sys.exit(1)
Example #6
def makeDate(datestring):
    d = datetime.utcnow()

    try:
        d = parser.parse(datestring)
    except ValueError:
        if verbose > 0:
            eprint("makeDate: %s is not a valid datetime" % datestring)

    return str(d.strftime("%Y-%m-%dT%H:%M:%S.%f"))
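A brief, hypothetical illustration of the fallback behavior (assumes the module imports `parser` from `dateutil` and defines `verbose`, as the function above implies):

# Illustrative only: a parseable string is converted, anything else falls
# back to the current UTC time (with a warning if `verbose` is set).
print(makeDate("2021-03-01 12:00:00"))  # -> 2021-03-01T12:00:00.000000
print(makeDate("not a date"))           # -> current UTC time, same format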
Example #7
 def check_overwrite(
         self,
         offs,
 ):
     """
     Check whether we're overwriting location `offs`.
     """
     if self.mem[offs >> 2]:
         eprint(f'Overwriting location {offs:08x}', error=not self.no_error_stop)
         if not self.no_error_stop:
             sys.exit(1)
Example #8
def main():
    init()

    while True:
        (rv, original) = capture.read()
        if (not rv): break

        fps.incrementFrames()

        # Rotate the image, if the camera is on its side
        if (camera_rotation):
            original = np.rot90(original, camera_rotation)

        # Downscale image to make findtheface() faster
        img = cv2.resize(original, frame_downscale)
        imgscale = float(original.shape[0]) / img.shape[0]

        # Find the face and eyes using the Haar cascade
        (face, eyes) = findtheface(img)

        # If both are found, center on the eyes and scale
        if (face is not None and eyes is not None):
            img = centerandscale(img, face[0], eyes[0])

        # Crop to the proper aspect ratio
        img = crop(img)

        # Show the image (and frames per second)
        cv2.imshow(title, cv2.flip(img, 1))
        if (fps.framecount % 10 == 0):
            eprint('%.2f fps' % fps.getFPS(), end='\r')
            fps.reset()

        # Show image and wait for a key
        c = chr(cv2.waitKey(1) & 0xFF)

        if (c == 'q' or c == '\x1b'):  # q or ESC to quit
            break
        if (c == ' ' or c == 'p' or c == 's'):  # Print a screenshot
            img = centerandscale(original, np.dot(face[0], imgscale),
                                 np.dot(eyes[0], imgscale))
            img = crop(img)
            cv2.imwrite("passport.jpg", img)
            print("Wrote image to passport.jpg")

    capture.release()
    cv2.destroyAllWindows()
Example #9
def convert_to_cdi(site, funct, asm_line, asm_dest, cfg, sled_id_faucet,
                   dwarf_loc, options, functs):
    """Converts asm_line to cdi compliant code then writes it to asm_dest"""

    if site.group == site.CALL_SITE:
        convert_call_site(site, funct, asm_line, asm_dest, sled_id_faucet,
                          dwarf_loc, options, functs)
    elif site.group == site.RETURN_SITE:
        convert_return_site(site, funct, asm_line, asm_dest, cfg,
                            sled_id_faucet, dwarf_loc, options, functs)
    elif site.group == site.INDIR_JMP_SITE:
        convert_indir_jmp_site(site, funct, asm_line, asm_dest)
    elif site.group == site.PLT_SITE:
        register_file_lines(asm_line, funct.asm_filename)
    else:
        eprint('warning: site has invalid type: line ' + site.asm_line_num,
               'in function named \'' + funct.asm_name + '\'')
Example #10
def get_device(device):
    """
    Change implementation configuration to match, depending on the `device`
    integer input value.
    """
    part = devices.partnum(device)
    print('Configuring device:', part)

    if device == 84:
        d = DevAI84(part)
    elif device == 85:
        d = DevAI85(part)
    elif device == 87:
        d = DevAI85(part)  # For now, no differences from AI85
    else:
        eprint(f'Unknown device code `{device}`')
        sys.exit(1)

    return d
Example #11
def relauncher_main(argv=None):
    if argv is None:
        argv = sys.argv

    if verbose_enabled(argv):
        print("Env.relauncher_main called with {}".format(str(argv)))

    #find gnucash-env, fail if we can't find it
    gnucash_env = find_prog("gnucash-env")
    if gnucash_env is None:
        eprint("Could not find gnucash-env!  Is GnuCash correctly installed?")
        sys.exit(1)

    #need to pass the --no-relaunch flag to make sure we don't get stuck in an infinite loop
    #this will make choose_main run accregex_main instead of relauncher_main
    additional_args = shlex.split("python2 -maccregex --no-relaunch")
    new_argv = additional_args + argv

    relaunch(gnucash_env, new_argv)
Example #13
def build_ret_dicts(cfg):
    """Builds return dictionaries of all functions in the CFG

    Notice that when a given function is being examined, it is all the other
    functions' return dictionaries that are being built. After all, a function 
    foo's return dictionary depends on which functions call foo
    """
    arbitrary_ftype = funct_cfg.FunctionType.arbitrary
    beg_multiplicity = 1

    for funct in cfg:
        call_dict = dict()
        for site in funct.sites:
            if site.group == site.CALL_SITE:
                for target in site.targets:
                    increment_dict(call_dict, target.uniq_label,
                                   beg_multiplicity)

        for target_label, multiplicity in call_dict.iteritems():
            try:
                cfg.funct(target_label).ret_dict[funct.uniq_label] = \
                    multiplicity
            except KeyError:
                eprint("warning: function cannot be found: " + target_label)
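`increment_dict` is not shown in this example; a minimal sketch of what such a helper presumably does, judging only from how it is called above (an assumption, not the project's actual implementation):

def increment_dict(d, key, amount):
    """Add `amount` to d[key], creating the entry if it does not exist yet."""
    d[key] = d.get(key, 0) + amount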
Example #14
def play(
    *,
    media,
    verbose: Union[bool, int, float],
    novideo: bool = False,
    noaudio: bool = False,
    subtitles: bool = False,
    loop: bool = False,
    skip_ahead: Optional[float] = None,
    ban_clipboard: bool = False,
    fullscreen: bool = False,
):

    global QUIT
    QUIT = False
    global BAN
    BAN = False
    media = Path(media).absolute()
    ic(media.as_posix())

    if check_for_banned_hash(
            media=media,
            verbose=verbose,
    ):
        return

    # assert 'sources' in media.parts
    try:
        chan = extract_chan(
            path=media,
            verbose=verbose,
        )
    except ValueError:
        chan = None

    video = not novideo
    audio = not noaudio  # todo
    if video:
        video = "auto"

    player = mpv.MPV(
        log_handler=logger,
        input_default_bindings=True,
        terminal=True,
        input_terminal=True,
        input_vo_keyboard=True,
        # script_opts='osc-layout=bottombar,osc-seekbarstyle=bar,osc-deadzonesize=0,osc-minmousemove=3',
        # script_opts='osc-layout=bottombar',
        osd_bar=False,
        scripts="/home/user/.config/mpv/osc_seek.lua",
        # osc=True,
        video=video,
    )

    # self.m = mpv.MPV(vo='x11')
    # ic(get_current_virtural_terminal())

    if not in_xorg(verbose=verbose):
        player.vo = "drm"
        player.gpu_context = "auto"
    else:
        player.vo = "gpu"
        player.hwdec = "vaapi"

    if fullscreen:
        player.fullscreen = True

    if loop:
        # player.loop_playlist = 'inf'
        player.loop_file = "inf"

    if subtitles:
        player.sub = "yes"
    else:
        player.sub = "no"

    # if skip_ahead:
    #    player.start(skip_ahead)

    # https://github.com/jaseg/python-mpv/issues/122
    # player.on_key_press('ESC')(player.quit)
    # player.on_key_press('ENTER')(lambda: player.playlist_next(mode='force'))

    @player.on_key_press("Alt+i")
    def my_alt_i_binding():
        # ic('Alt+i works')
        media_ext = media.name.split(".")[-1]
        # ic(media_ext)
        # if media_ext:
        try:
            media_json_file = media.as_posix().replace("." + media_ext,
                                                       ".info.json")
            ic(media_json_file)
            url = jsonparser(path=media_json_file, key="webpage_url")
            ic(url)
        except (UnicodeDecodeError, PermissionError):
            # ic(e)  # nope, will print the binary that was not json
            url = None

        if url:
            put_clipboard(
                url,
                verbose=verbose,
            )
            if os.getuid() == 0:
                os.system('su user -c "/home/user/bin/spider-iri 1" &')
            else:
                os.system("/home/user/bin/spider-iri 1 &")
        else:
            if os.getuid() == 0:
                os.system("su user -c \"/usr/bin/iridb import '{}'\"".format(
                    media.as_posix()))
            else:
                os.system("/usr/bin/iridb import {}".format(media.as_posix()))
        ic("done with Alt+i routine")

    @player.on_key_press("Meta+i")
    def my_meta_i_binding():
        ic("Meta+i works")

    @player.on_key_press("D")
    def my_D_binding():
        ic("D works")
        os.system("mv -vi " + '"' + media.as_posix() + '"' + " /delme/")

    @player.on_key_press("B")
    def my_B_binding():
        global BAN
        BAN = True
        ic("banning:", chan)
        # player.terminate()
        player.quit()
        # pillow_img = player.screenshot_raw()
        # pillow_img.save('screenshot.png')

    @player.on_key_press("L")
    def my_L_binding():
        global PLAY_LATER
        PLAY_LATER = True
        ic("PLAY_LATER:", chan)
        player.quit()

    @player.on_key_press("Meta+L")
    def my_meta_L_binding():
        global PLAY_LATER
        PLAY_LATER = True
        ic("PLAY_LATER:", chan)
        player.quit()

    # player.on_key_press('ENTER')(lambda: player.playlist_next(mode='force'))
    @player.on_key_press("ENTER")
    def my_enter_keybinding():
        ic()
        player.playlist_next(mode="force")

    # ESC must be pressed 2x if the focus is on the terminal due to mpv design:
    # https://github.com/jaseg/python-mpv/issues/122
    player.on_key_press("ESC")(player.quit)
    player.register_key_binding("INS", "seek 5")

    # @player.on_key_press('ESC')
    # def my_esc_binding():
    #    #player.quit()
    #    global QUIT
    #    QUIT = True
    #    player.terminate()

    try:
        player.play(media.as_posix())
        # https://github.com/jaseg/python-mpv/issues/79
        if skip_ahead:
            player.wait_for_property("seekable")
            player.seek(skip_ahead, reference="absolute", precision="exact")
        player.wait_for_playback()
    except mpv.ShutdownError:
        eprint("\nmpv.ShutdownError\n")
        player.terminate()
        if BAN:
            if ban_clipboard:
                clipboard = get_clipboard(
                    one_line=True,
                    verbose=verbose,
                )
                ic("raising BanClipboardError:", clipboard)
                raise BanClipboardError(clipboard)
            else:
                ic("raising BanChanError:", chan)
                raise BanChanError(chan)

        if PLAY_LATER:
            ic("raising PlayChanLaterError:", chan)
            raise PlayChanLaterError(chan)

        raise StopPlayingError
        # pass

    ic("calling player.terminate()")
    player.terminate()
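A hypothetical invocation of the player wrapper above; the media path is a placeholder, the remaining arguments follow the keyword-only signature:

play(media='/tmp/example.mkv',
     verbose=False,
     subtitles=True,
     fullscreen=True)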
Example #15
def load(  # pylint: disable=too-many-branches,too-many-statements
    verbose,
    embedded_code,
    device,
    apb,
    start_layer,
    layers,
    operator,
    kernel,
    kernel_size,
    quantization,
    processor_map,
    output_processor_map,
    input_chan,
    output_chan,
    out_expand,
    out_expand_thresh,
    in_expand,
    in_expand_thresh,
    flatten=False,
    mexpress=False,
    verify=False,
    riscv_flash=False,
    quad=False,
    debug=False,
    blocklevel=False,
    legacy_kernels=False,
    calcx4=False,
):
    """
    Stack `kernel` values and write them to C code (for `embedded_code` if `True` or
    RTL simulation). The output is written to the `apb` object.
    Input is configured with `kernel_size`, `quantization`, `layers`, `processor_map`,
    `output_processor_map`, `input_chan`, `output_chan`, `out_expand` and `out_expand_thresh`.
    When `mexpress` is `True`, the function uses the memcpy()-friendly hardware functionality to
    reduce the number of transfers. When `verify` is also true (mexpress mode only), kernels are
    read back and compared.
    This function returns the kernel offsets and the kernel lengths for all layers.
    """
    # Kernels: Stack kernels; write only the kernels needed
    proc_kern_max = [0] * tc.dev.MAX_PROC
    kern_offs = [0] * layers
    kern_len = [0] * layers
    kernel_map = np.full((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE),
                         _INVALID_VALUE,
                         dtype=np.int64)
    kernels_used = np.zeros((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE),
                            dtype=np.int64)
    kernel_data = np.zeros((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE, 9),
                           dtype=np.int8)
    # There are four 32-bit words per 9-byte kernel.
    # The value map is initialized with zeros so we can later ignore unused entries and use
    # memcpy() on initialized and uninitialized data.
    kernel_values = np.zeros(
        (tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE * _WORDS_PER_KERNEL),
        dtype=np.int64)
    if debug:
        print('\nLoading Kernels...')

    if calcx4 and not tc.dev.SUPPORT_CALCX4:
        eprint('--calcx4 is not supported on this device.')
        sys.exit(1)
    assert not (
        (embedded_code or mexpress) and calcx4)  # FIXME Add support later

    for ll in range(start_layer, layers):
        if operator[ll] not in [op.CONV1D, op.CONV2D, op.CONVTRANSPOSE2D]:
            kern_len[ll] = 0
            kern_offs[ll] = 0
            continue

        if flatten[ll]:
            kernel_reshaped = kernel[ll].reshape(
                output_chan[ll] * input_chan[ll],
                -1,
                kernel_size[ll][0],
                kernel_size[ll][1],
            )
        else:
            kernel_reshaped = kernel[ll]

        first_proc = ffs(processor_map[ll])
        last_proc = fls(processor_map[ll])
        ch = 0
        m = 0
        for p in range(first_proc, last_proc + 1):
            if (processor_map[ll] >> p) & 1 == 0:
                # Unused processor
                continue
            # Get highest offset for all used processors
            kern_offs[ll] = max(proc_kern_max[p], kern_offs[ll])

        ksize = kernel_size[ll][0] * kernel_size[ll][1]
        qfactor = 8 // quantization[ll]
        # Determine the number of kernels that need to be programmed. Since each instance
        # spans 4 processors, kernels for all instances that have a single processor enabled
        # need to be written, i.e. round down the first. The last does not need to be rounded
        # up because hardware takes care of it.
        next_layer_map = output_processor_map[ll]
        # When using kernels smaller than 8 bit, round up to the next 8-bit boundary
        # Gaps are accounted for like any other kernel.
        kern_len[ll] = 1 + quantization[ll] * \
            (fls(next_layer_map) - ffs(next_layer_map)) // 8
        # This extends the kernels to the right on AI85 for input and output expansion
        if output_chan[ll] > tc.dev.MAX_PROC:
            kern_len[ll] = (kern_len[ll] + tc.dev.P_SHARED -
                            1) & ~(tc.dev.P_SHARED - 1)
        kern_len[ll] *= out_expand[ll] * in_expand[ll]
        if not legacy_kernels and flatten[ll]:
            kern_len[ll] *= kernel_reshaped.shape[1]
            kern_len[ll] -= (out_expand[ll] * popcount(next_layer_map) - output_chan[ll]) \
                * kernel_reshaped.shape[1] * 8 // (ksize * quantization[ll])
        if device != 84:
            # Pack kernels when using 1D convolutions, or 1x1 kernels
            kern_len[ll] = (kern_len[ll] * ksize + 8) // 9
        if ll == 0 and quad:
            kern_len[0] = (kern_len[0] + 3) // 4

        # We don't have to use dummy columns if there's space available on the left
        kern_offs[ll] = \
            max(0, kern_offs[ll] - (((ffs(next_layer_map) % tc.dev.P_SHARED)
                                     + qfactor - 1) // qfactor))
        # The kernel offset needs to start at a multiple of 4.
        kern_offs[ll] = (kern_offs[ll] + tc.dev.P_SHARED -
                         1) & ~(tc.dev.P_SHARED - 1)
        if kern_offs[ll] + kern_len[ll] > tc.dev.mask_width(p):
            eprint(
                f'\nKernel memory exceeded at layer {ll}; offset: {kern_offs[ll]}, '
                f'needed: {kern_len[ll]}.'
                '\n\nKernel map so far:')
            print_map(layers, kernel_map, print_fn=eprint_noprefix)
            sys.exit(1)

        proc_mask = 2**qfactor - 1
        # Start at the first used instance
        this_map_init = next_layer_map >> ffs(next_layer_map)
        start_col = ffs(
            next_layer_map) % tc.dev.P_SHARED  # First target column

        for p in range(first_proc, last_proc + 1):
            if (processor_map[ll] >> p) & 1 == 0:
                # Unused source processor
                continue
            col_target = start_col
            for expand in range(out_expand[ll]):
                this_map = this_map_init
                if ll == 0 and quad:
                    col = expand * (out_expand_thresh[ll] + 3) // 4
                    stop_col = col + (out_expand_thresh[ll] + 3) // 4
                else:
                    col = expand * out_expand_thresh[ll]
                    stop_col = col + out_expand_thresh[ll]
                while col < stop_col:
                    # Skip over unused bits in the target processor map
                    # (unused means 1 bit for 8-bit weights, 2 for 4-bit weights, etc.)
                    if this_map != 0:
                        while this_map & proc_mask == 0:
                            assert this_map != 0
                            col_target += 1  # Completely skip
                            this_map >>= qfactor  # and slide forward
                    this_mask = this_map & proc_mask
                    this_map >>= qfactor

                    if ll == 0 and quad:
                        src_offs = ch + (m - p // 16) * input_chan[ll]
                    else:
                        src_offs = ch + m * input_chan[ll]
                    if ll > 0 or not quad or (m % 4 == p // 16):
                        for ie in range(in_expand[ll]):
                            mask = this_mask

                            def add_kernel_data(ll, p, col_target, b):
                                col = kern_offs[ll] + col_target
                                if col >= tc.dev.mask_width(p):
                                    eprint(
                                        f'\nKernel memory exceeded in layer {ll}.'
                                        '\n\nKernel map so far:')
                                    print_map(layers,
                                              kernel_map,
                                              print_fn=eprint_noprefix)
                                    sys.exit(1)

                                if kernels_used[p][
                                        col] == 0:  # Update kernel map
                                    assert kernel_map[p][col] == _INVALID_VALUE
                                    kernel_map[p][col] = ll

                                assert kernels_used[p][col] <= 8
                                kernel_data[p][col][
                                    8 - kernels_used[p][col]] = b & 0xff
                                kernels_used[p][col] += 1

                                if kernels_used[p][col] == 9:  # Flush
                                    col_target += 1  # Write 1

                                return col_target

                            n = 0
                            if src_offs < len(kernel_reshaped):
                                if not flatten[ll]:
                                    k = np.zeros_like(
                                        kernel_reshaped[src_offs].flatten())
                                    for i in range(qfactor):
                                        if m < output_chan[ll]:
                                            # Cycle through phases
                                            idx = n + ie * qfactor
                                            koffs = src_offs + (idx % in_expand[ll]) \
                                                * in_expand_thresh[ll] \
                                                + (idx // in_expand[ll]) \
                                                * input_chan[ll]
                                            if koffs < len(kernel_reshaped):
                                                this_kern = kernel_reshaped[koffs].flatten() \
                                                    & (2**quantization[ll]-1)
                                                k |= this_kern << (
                                                    i * quantization[ll])
                                            n += 1
                                        mask >>= 1
                                else:
                                    kl = (len(kernel_reshaped[src_offs]) +
                                          qfactor - 1) // qfactor
                                    k = np.zeros(kl, dtype=np.int64)
                                    if m < output_chan[ll]:
                                        # Cycle through phases
                                        idx = n + ie * qfactor
                                        koffs = src_offs + (idx % in_expand[ll]) \
                                            * in_expand_thresh[ll] \
                                            + (idx // in_expand[ll]) \
                                            * input_chan[ll]
                                        if koffs < len(kernel_reshaped):
                                            this_kern = kernel_reshaped[
                                                koffs].flatten()
                                            if len(this_kern) % qfactor != 0:
                                                this_kern = np.append(
                                                    this_kern,
                                                    np.zeros(qfactor -
                                                             len(this_kern) %
                                                             qfactor,
                                                             dtype=np.int64))
                                            for i in range(qfactor):
                                                k |= ((this_kern[i::qfactor]
                                                       & (2**quantization[ll]-1))) \
                                                    << (i * quantization[ll])
                                        n += 1
                                        mask >>= 1
                                if debug:
                                    with np.printoptions(
                                            formatter={
                                                'int': '{0:02x}'.format
                                            }):
                                        print(
                                            f'Layer {ll} processor {p} channel '
                                            f'{ch + ie * in_expand_thresh[ll]} m[{m}..{m+n-1}] '
                                            f'of {output_chan[ll]}: {k}')

                                if flatten[ll]:
                                    for _, e in enumerate(k):
                                        col_target = add_kernel_data(
                                            ll, p, col_target, e)
                                else:
                                    for i in range(ksize):
                                        col_target = add_kernel_data(
                                            ll, p, col_target,
                                            k[ksize - i - 1])

                            else:  # When expanding, need to pad with zero kernels if needed
                                for _ in range(ksize // qfactor):
                                    col_target = add_kernel_data(
                                        ll, p, col_target, 0)

                        # Consume kernels
                        if not flatten[ll]:
                            col += qfactor
                            m += qfactor
                        else:
                            col += 1
                            m += 1
                    else:
                        m += qfactor

            if kern_offs[ll] + col_target < tc.dev.mask_width(p) \
               and kernels_used[p][kern_offs[ll] + col_target] > 0:  # Partials
                col_target += 1
            while col_target - start_col < kern_len[ll]:
                col_target = add_kernel_data(ll, p, col_target, 0)
            if flatten[ll]:
                kern_len[ll] = col_target
            else:
                assert kern_len[ll] == col_target - start_col
            proc_kern_max[p] = kern_offs[ll] + kern_len[ll]
            ch += 1
            m = 0

    if verbose:
        print('\nKernel map:')
        print_map(layers, kernel_map)

    if verify or not (embedded_code or mexpress):
        if verify:
            apb.output('int verify_kernels(void)\n{\n')
        # Write in-line
        for p in range(tc.dev.MAX_PROC):
            for col in range(0, tc.dev.mask_width(p)):
                ll = kernel_map[p][col]
                if ll != _INVALID_VALUE:
                    k = kernel_data[p][col]
                    apb.write_kern(ll,
                                   p,
                                   col,
                                   k,
                                   verify_only=verify,
                                   calcx4=calcx4)
        if verify:
            apb.output('  return 1;\n}\n\n')
    if embedded_code or mexpress:
        # Write kernels, combining layers and processors where possible to reduce the number
        # of constants and calls to memcpy.
        apb.output('// Kernels:\n')

        if not mexpress:
            for p in range(tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        k = kernel_data[p][col]
                        offs = _WORDS_PER_KERNEL * col
                        kernel_values[p][offs] = k[0] & 0xff
                        kernel_values[p][offs + 1] = (k[1] & 0xff) << 24 \
                            | (k[2] & 0xff) << 16 | (k[3] & 0xff) << 8 | k[4] & 0xff
                        kernel_values[p][offs + 2] = (k[5] & 0xff) << 24 \
                            | (k[6] & 0xff) << 16 | (k[7] & 0xff) << 8 | k[8] & 0xff

            # First, define the weights (will move to header file)
            # Combining memcopy() requires stacked memories
            max_col = [-1] * tc.dev.MAX_PROC
            min_col = [tc.dev.MASK_WIDTH_LARGE if not legacy_kernels else 0
                       ] * tc.dev.MAX_PROC
            for p in range(0, tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col[p] = col
                        min_col[p] = min(min_col[p], col)
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    start = p
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1))
                           == (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                    # Combine multiple channels into one define
                    k = None
                    for i in range(start, p + 1):
                        if k is None:
                            k = kernel_values[i][min_col[i] *
                                                 _WORDS_PER_KERNEL:
                                                 (max_col[i] + 1) *
                                                 _WORDS_PER_KERNEL]
                        else:
                            k = np.concatenate(
                                (k, kernel_values[i]
                                 [min_col[i] *
                                  _WORDS_PER_KERNEL:(max_col[i] + 1) *
                                  _WORDS_PER_KERNEL]))

                    apb.output_define(k, f'KERNELS_{start}', '0x%08x', 8)
                p += 1

            # Second, initialize static const variables as source for memcpy
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    span = max_col[p] + 1 - min_col[p]
                    start = p
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1))
                           == (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                        span += max_col[p] + 1 - min_col[p]
                    if riscv_flash:
                        apb.output(rv.RISCV_FLASH)
                    apb.output(
                        f'static const uint32_t kernels_{start}[] = KERNELS_{start};\n'
                    )
                p += 1
            apb.output('\n')

            # Generate code to load the weights using memcpy
            apb.output(
                'void memcpy_96to128(uint32_t *dst, const uint32_t *src, int n)\n{\n'
            )
            apb.output('  while (n-- > 0) {\n'
                       '    *dst++ = *src++;\n'
                       '    *dst++ = *src++;\n'
                       '    *dst++ = *src++;\n'
                       '    *dst++ = 0;  // Execute write\n'
                       '  }\n}\n\n')
        else:
            # When using the express loader, gather all consecutive kernels for each processor
            # and pack them.
            zero_kernel = np.array([0] * 9, dtype=np.uint8)
            k = None

            for p in range(tc.dev.MAX_PROC):
                # Find min/max from kernel_map
                max_col = -1
                min_col = tc.dev.mask_width(p) if not legacy_kernels else 0
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col = col
                        min_col = min(min_col, col)
                if max_col >= 0:
                    for col in range(min_col, max_col + 1):
                        ll = kernel_map[p][col]
                        if ll != _INVALID_VALUE:
                            new_k = (kernel_data[p][col] & 0xff).astype(
                                np.uint8)
                        else:
                            new_k = zero_kernel
                        if k is None:
                            k = new_k
                        else:
                            k = np.concatenate((k, new_k))

                    # Round up to multiple of 4
                    if len(k) % 4 != 0:
                        k = np.concatenate((k, zero_kernel[:4 - len(k) % 4]))
                    # '>u4' swaps endianness to what the hardware needs, `view` packs into 32-bit
                    if not blocklevel:
                        apb.output_define(k.view(dtype='>u4'), f'KERNELS_{p}',
                                          '0x%08x', 8)
                    else:
                        addr = tc.dev.C_GROUP_OFFS * (p // tc.dev.P_NUMPRO) \
                            + tc.dev.C_MRAM_BASE + (p % tc.dev.P_NUMPRO) * tc.dev.MASK_OFFS * 16
                        apb.write(addr + min_col * 4 | 0x01, 0x01)
                        kb = k.view(dtype=">u4")
                        for _, e in enumerate(kb):
                            apb.write(addr, e)
                            addr += 4

                    if riscv_flash:
                        apb.output(rv.RISCV_FLASH)
                    apb.output(
                        f'static const uint32_t kernels_{p}[] = KERNELS_{p};\n'
                    )
                    k = None
            apb.output('\n')

        if not blocklevel:
            apb.output('void load_kernels(void)\n{\n')
            max_col = [-1] * tc.dev.MAX_PROC
            min_col = [tc.dev.MASK_WIDTH_LARGE if not legacy_kernels else 0
                       ] * tc.dev.MAX_PROC
            for p in range(0, tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col[p] = col
                        min_col[p] = min(min_col[p], col)
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    span = max_col[p] + 1 - min_col[p]
                    start = p
                    addr = apb.apb_base + tc.dev.C_GROUP_OFFS * (p // tc.dev.P_NUMPRO) \
                        + tc.dev.C_MRAM_BASE + (p % tc.dev.P_NUMPRO) * tc.dev.MASK_OFFS * 16
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1))
                           == (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                        span += max_col[p] + 1 - min_col[p]
                    assert addr % 16 == 0
                    if not mexpress:
                        apb.output('  memcpy_96to128((uint32_t *)'
                                   f' 0x{addr + min_col[start] * 16:08x},'
                                   f' kernels_{start}, {span});\n')
                    else:
                        apb.output(
                            '  *((volatile uint8_t *)'
                            f' 0x{addr + min_col[start] * 4 | 0x01:08x}) = 0x01; '
                            '// Set address\n')
                        apb.output(
                            f'  memcpy32((uint32_t *) 0x{addr:08x}, '
                            f'kernels_{start}, {(span * 9 + 3) // 4});\n')
                p += 1

            apb.output('}\n\n')

    return kern_offs, kern_len
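For reference, the non-mexpress path above packs each 9-byte kernel into three 32-bit words (the fourth word emitted by `memcpy_96to128()` is always zero and only triggers the hardware write). A standalone sketch that mirrors the bit layout used in the `kernel_values` assignments above:

def pack_kernel_words(k):
    """Pack one 9-byte kernel into the three 32-bit words used above.

    `k` is a sequence of nine (signed) byte values, e.g. kernel_data[p][col].
    """
    b = [int(v) & 0xff for v in k]  # force plain Python ints in 0..255
    assert len(b) == 9
    w0 = b[0]
    w1 = b[1] << 24 | b[2] << 16 | b[3] << 8 | b[4]
    w2 = b[5] << 24 | b[6] << 16 | b[7] << 8 | b[8]
    return w0, w1, w2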
Example #16
def main(
    memfile,
    classification_layer=False,
    unload=False,
    softmax=False,
    embedded_code=False,
    oneshot=0,
    stopstart=False,
    riscv=None,
    riscv_exclusive=False,
    riscv_flash=False,  # pylint: disable=unused-argument
    riscv_cache=False,
    riscv_debug=False,
    riscv_debugwait=True,
    camera=False,
    camera_format=None,
    device=84,
    channels=None,
    sleep=False,
    output_width=8,
    num_classes=None,
    clock_trim=None,
    embedded_arm=False,
    groups=None,
    boost=None,
    forever=False,
    fifo=False,
    mexpress=False,
):
    """
    Write the main function (including an optional call to the fully connected layer if
    `classification_layer` is `True`) to `memfile`.
    """
    assert groups is not None
    mask = 0
    for _, group in enumerate(groups):
        mask |= 1 << group
    unmask = ~mask & ((1 << tc.dev.P_NUMGROUPS) - 1)

    if softmax and output_width == 8:
        eprint('--softmax should only be used with `output_width: 32`',
               error=False)

    if unload:
        memfile.write(f'#define NUM_OUTPUTS {num_classes}\n')
        memfile.write(f'static int{output_width}_t ml_data[NUM_OUTPUTS];\n\n')

    memfile.write('int main(void)\n{\n')
    if clock_trim is not None and not riscv:
        memfile.write('  uint32_t trim;\n')
    if embedded_code and (classification_layer or softmax) or oneshot > 0:
        memfile.write('  int i;\n')
    if embedded_arm and riscv_debugwait:
        memfile.write('  int i;\n')
    if embedded_code and (classification_layer or softmax):
        memfile.write('  int digs, tens;\n')

    if riscv is None or not riscv:
        if embedded_code or embedded_arm:
            if device == 84:
                memfile.write('  icache_enable();\n\n')
                memfile.write('  SYS_ClockEnable(SYS_PERIPH_CLOCK_AI);\n')
            else:
                memfile.write(
                    '\n  MXC_ICC_Enable(MXC_ICC0); // Enable cache\n\n')
                if clock_trim is not None:
                    memfile.write('  // Manual clock trim override:\n')
                    memfile.write('  *((volatile uint32_t *) 0x40000c00) = 1; '
                                  '// Set TME\n')
                    if clock_trim[0] or clock_trim[1]:
                        memfile.write(
                            '  trim = *((volatile uint32_t *) 0x40005420);\n')
                        if clock_trim[0]:
                            memfile.write('  trim &= ~0xffff;\n'
                                          f'  trim |= 0x{clock_trim[0]:x}; '
                                          '// HIRC8M (7.3728 MHz) trim\n')
                        if clock_trim[1]:
                            memfile.write(
                                '  trim &= ~(0x1ff << 22);\n'
                                f'  trim |= 0x{clock_trim[1]:x} << 22; '
                                '// HIRC (60 MHz) trim\n')
                        memfile.write(
                            '  *((volatile uint32_t *) 0x40005420) = trim;\n')
                    if clock_trim[2]:
                        memfile.write(
                            '  trim = *((volatile uint32_t *) 0x40005440) & '
                            '~(0x1ff << 15);\n')
                        memfile.write(
                            '  *((volatile uint32_t *) 0x40005440) = '
                            'trim | (0xff << 15); // HILIM\n')
                        memfile.write(
                            '  *((volatile uint32_t *) 0x40006c04) = '
                            f'0x{clock_trim[2]:x}; // HIRC96M (100 MHz) trim\n'
                        )
                    memfile.write('  *((volatile uint32_t *) 0x40000c00) = 0; '
                                  '// Clear TME\n\n')

                memfile.write('  // Switch to 100 MHz clock\n')
                memfile.write('  MXC_SYS_Clock_Select(MXC_SYS_CLOCK_IPO);\n')
                memfile.write('  SystemCoreClockUpdate();\n')

                memfile.write(
                    '\n  // Reset all domains, restore power to CNN\n')
                memfile.write('  MXC_BBFC->reg3 = 0xf; // Reset\n')
                memfile.write(
                    f'  MXC_BBFC->reg1 = 0x{mask:01x}; // Mask memory\n')
                memfile.write(f'  MXC_BBFC->reg0 = 0x{mask:01x}; // Power\n')
                memfile.write(f'  MXC_BBFC->reg2 = 0x{unmask:01x}; // Iso\n')
                memfile.write('  MXC_BBFC->reg3 = 0x0; // Reset\n\n')

                memfile.write(
                    '  MXC_GCR->pclkdiv &= ~(MXC_F_GCR_PCLKDIV_CNNCLKDIV | '
                    'MXC_F_GCR_PCLKDIV_CNNCLKSEL);\n'
                    '  MXC_GCR->pclkdiv |= MXC_S_GCR_PCLKDIV_CNNCLKDIV_DIV1; '
                    '// CNN clock: 100 MHz div 2\n')
                memfile.write(
                    '  MXC_SYS_ClockEnable(MXC_SYS_PERIPH_CLOCK_CNN); '
                    '// Enable CNN clock\n')

                if boost is not None:
                    memfile.write(f'\n  // Configure P{boost[0]}.{boost[1]}, '
                                  'turn on the CNN Boost\n')
                    memfile.write('  mxc_gpio_cfg_t gpio_out;\n')
                    memfile.write(f'  gpio_out.port = MXC_GPIO{boost[0]};\n')
                    memfile.write(
                        f'  gpio_out.mask = MXC_GPIO_PIN_{boost[1]};\n')
                    memfile.write('  gpio_out.pad = MXC_GPIO_PAD_NONE;\n')
                    memfile.write('  gpio_out.func = MXC_GPIO_FUNC_OUT;\n')
                    memfile.write('  MXC_GPIO_Config(&gpio_out);\n')
                    memfile.write(
                        '  MXC_GPIO_OutSet(gpio_out.port, gpio_out.mask);\n')
        else:
            memfile.write('  icache_enable();\n\n')
            if device == 84:
                memfile.write(
                    '  MXC_GCR->perckcn1 &= ~0x20; // Enable CNN clock\n')
            else:
                memfile.write(
                    '  *((volatile uint32_t *) 0x40000c00) = 0x00000001; // Set TME\n'
                )
                memfile.write(
                    '  *((volatile uint32_t *) 0x40006c04) = 0x000001a0; // 96M trim\n'
                )
                memfile.write(
                    '  *((volatile uint32_t *) 0x40000c00) = 0x00000000; '
                    '// Clear TME\n\n')
                memfile.write(
                    '  MXC_GCR->clkcn |= MXC_F_GCR_CLKCN_HIRC96M_EN; // Enable 96M\n'
                )
                memfile.write(
                    '  while ((MXC_GCR->clkcn & MXC_F_GCR_CLKCN_HIRC96M_RDY) == 0) ; '
                    '// Wait for 96M\n')
                memfile.write(
                    '  MXC_GCR->clkcn |= MXC_S_GCR_CLKCN_CLKSEL_HIRC96; // Select 96M\n'
                )

                memfile.write(
                    '\n  // Reset all domains, restore power to CNN\n')
                memfile.write('  MXC_BBFC->reg3 = 0xf; // Reset\n')
                memfile.write(
                    f'  MXC_BBFC->reg1 = 0x{mask:01x}; // Mask memory\n')
                memfile.write(f'  MXC_BBFC->reg0 = 0x{mask:01x}; // Power\n')
                memfile.write(f'  MXC_BBFC->reg2 = 0x{unmask:01x}; // Iso\n')
                memfile.write('  MXC_BBFC->reg3 = 0x0; // Reset\n\n')

                memfile.write(
                    '  MXC_GCR->pckdiv = 0x00010000; // CNN clock 96M div 2\n')
                memfile.write(
                    '  MXC_GCR->perckcn &= ~0x2000000; // Enable CNN clock\n')

        if riscv is not None:
            if riscv_cache:
                if embedded_code or embedded_arm:
                    memfile.write(
                        '\n  MXC_FCR->urvbootaddr = (uint32_t) &__FlashStart_;'
                        '// Set RISC-V boot address\n')
                else:
                    memfile.write(
                        f'  MXC_NBBFC->reg4 = 0x{rv.RISCV_CODE_ORIGIN:08x}; '
                        '// Set RISC-V boot address\n')
            if riscv_exclusive:
                if embedded_code or embedded_arm:
                    memfile.write('  MXC_FCR->urvctrl |= 0x00000001; '
                                  '// Exclusive SRAM access for RISC-V\n')
                else:
                    memfile.write(
                        '  *((volatile uint32_t *) 0x40000814) |= 0x00000001; '
                        '// Exclusive SRAM access for RISC-V (MXC_NBBFC->reg5)\n'
                    )
            if embedded_code or embedded_arm:
                memfile.write(
                    '  MXC_GCR->pclkdis1 &= ~MXC_F_GCR_PCLKDIS1_CPU1; '
                    '// Enable RISC-V clock\n')
            else:
                memfile.write(
                    '  MXC_GCR->perckcn1 &= ~MXC_F_GCR_PERCKCN1_CPU1; '
                    '// Enable RISC-V clock\n')
        memfile.write('\n')
    elif riscv:
        if riscv_debug and embedded_code:
            memfile.write('  Debug_Init(); // Set up RISCV JTAG\n')
        if riscv_cache:
            if not embedded_code:
                memfile.write('  icache1_enable();\n')
                memfile.write('  invalidate_icache1();\n\n')
            else:
                memfile.write(
                    '  MXC_ICC_Enable(MXC_ICC1); // Enable cache\n\n')

    if camera:
        memfile.write('  enable_pcif_clock(); // Enable camera clock\n')
        memfile.write('  set_pcif_gpio_altf();\n\n')
        if camera_format == 555:
            mode = '10'
            comment = '555'
        elif camera_format == 565:
            mode = '12'
            comment = '565'
        else:
            mode = '8'  # Default
            comment = '888'
        memfile.write(
            f'  // Enable {comment} format single image in external timing mode\n'
        )
        memfile.write(
            '  MXC_CAMERAIF0->ctrl = MXC_S_CAMERAIF_CTRL_READ_MODE_SINGLE_IMG +\n'
            f'                        MXC_S_CAMERAIF_CTRL_DATA_WIDTH_{mode}BIT +\n'
            '                        MXC_S_CAMERAIF_CTRL_DS_TIMING_EN_DIS +\n'
            '                        MXC_S_CAMERAIF_CTRL_PCIF_SYS_EN_EN')
        if channels == 3:
            memfile.write(' +\n                        (1<<30);\n\n')
        else:
            memfile.write(';\n\n')

    if riscv is None or riscv:
        if embedded_code:
            memfile.write('  printf("\\n*** CNN Test ***\\n");\n\n')

        if embedded_code:
            memfile.write('  if (!cnn_load()) fail();\n')
            memfile.write('  MXC_TMR_SW_Start(MXC_TMR0);\n')
        else:
            memfile.write('  if (!cnn_load()) { fail(); pass(); return 0; }\n')

        if stopstart:
            memfile.write('\n  cnn_stop();\n')
            memfile.write('  cnn_restart();\n\n')

        memfile.write('  cnn_wait();\n\n')
        if oneshot > 0:
            memfile.write(f'  for (i = 0; i < {oneshot}; i++) {{\n')
            memfile.write('    cnn_restart();\n')
            memfile.write('    cnn_wait();\n')
            memfile.write('  }\n\n')

        if not forever and boost is not None:
            memfile.write('  // Turn off the CNN Boost\n')
            memfile.write(
                '  MXC_GPIO_OutClr(gpio_out.port, gpio_out.mask);\n\n')

        memfile.write('  if (!cnn_check()) fail();\n')
        if classification_layer or softmax:
            memfile.write(
                f'  if (!{"softmax" if softmax else "fc"}_layer()) fail();\n')
        elif unload:
            memfile.write(f'  cnn_unload((uint{output_width}_t *) ml_data);\n')
        if classification_layer:
            memfile.write('  if (!fc_verify()) fail();\n')

        if embedded_code:
            memfile.write('\n  printf("\\n*** PASS ***\\n\\n");\n\n')
            memfile.write(
                '  printf("Time for CNN: %d us\\n\\n", cnn_time);\n\n')

        if not forever:
            memfile.write('  // Disable power to CNN\n')
            memfile.write('  MXC_BBFC->reg3 = 0xf; // Reset\n')
            memfile.write('  MXC_BBFC->reg1 = 0x0; // Mask memory\n')
            memfile.write('  MXC_BBFC->reg0 = 0x0; // Power\n')
            memfile.write('  MXC_BBFC->reg2 = 0xf; // Iso\n')
            memfile.write('  MXC_BBFC->reg3 = 0x0; // Reset\n\n')

        if not forever:
            if classification_layer or softmax:
                memfile.write(
                    '  printf("Classification results:\\n");\n'
                    '  for (i = 0; i < NUM_OUTPUTS; i++) {\n'
                    '    digs = (1000 * ml_softmax[i] + 0x4000) >> 15;\n'
                    '    tens = digs % 10;\n'
                    '    digs = digs / 10;\n'
                    '    printf("[%7d] -> Class %d: %d.%d%%\\n", '
                    f'{"fc_output" if classification_layer else "ml_data"}[i], '
                    'i, digs, tens);\n'
                    '  }\n\n')
        else:
            memfile.write(
                '  printf("Starting endless loop...\\n");\n\n  LED_On(1);\n\n')

            memfile.write('  while(1) {\n')

            gval = tc.dev.READY_SEL << 1
            if fifo:
                gval |= 1 << 15
            if device != 84:
                gval |= 1 << 3  # Enable clocks
            if mexpress:
                gval |= 1 << 20

            for _, group in enumerate(groups):
                addr = tc.dev.APB_BASE + tc.dev.C_GROUP_OFFS*group + tc.dev.C_CNN_BASE \
                    + tc.dev.REG_CTL*4
                memfile.write(
                    f'    *((volatile uint32_t *) 0x{addr:08x}) = 0x{gval:08x}; '
                    '// Stop SM\n')
            for _, group in enumerate(groups):
                val = gval | 0x800
                if group > 0:
                    val |= 0x01
                addr = tc.dev.APB_BASE + tc.dev.C_GROUP_OFFS*group + tc.dev.C_CNN_BASE \
                    + tc.dev.REG_CTL*4
                memfile.write(
                    f'    *((volatile uint32_t *) 0x{addr:08x}) = 0x{val:08x}; '
                    f'// Enable group {group}\n')

            addr = tc.dev.APB_BASE + tc.dev.C_CNN_BASE \
                + tc.dev.REG_CTL*4
            memfile.write(
                f'    *((volatile uint32_t *) 0x{addr:08x}) = 0x{gval | 0x01:08x}; '
                '// Master enable group 0\n')

            memfile.write(f'    while ((*((volatile uint32_t *) '
                          f'0x{tc.dev.APB_BASE + tc.dev.C_CNN_BASE:08x}) '
                          '& (1<<12)) != 1<<12) ;\n')

            memfile.write('  }\n')

    if riscv is not None and not riscv:
        if sleep:
            memfile.write(
                '  SCB->SCR |= SCB_SCR_SLEEPDEEP_Msk; // SLEEPDEEP=1\n')
        if embedded_arm:
            if riscv_debugwait:
                memfile.write('  for (i = 0; i < (1 << 27); i++); '
                              '// Let debugger interrupt if needed\n')
            memfile.write('  __WFI(); // Let RISC-V run\n')
        else:
            memfile.write('  asm volatile("wfi"); // Let RISC-V run\n')

    if not embedded_code and not embedded_arm:
        memfile.write('  pass();\n')
    memfile.write('  return 0;\n}\n\n')
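A hypothetical invocation of this code generator (argument values are illustrative; it assumes the `tc.dev` device tables have already been configured elsewhere, e.g. based on `get_device()` shown earlier):

# Writes a generated C main() into main.c.
with open('main.c', 'w') as memfile:
    main(memfile,
         softmax=True,
         embedded_code=True,
         device=85,
         output_width=32,
         groups=[0])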
Example #17
def conv2d(
        data,
        weight,
        bias,
        input_size,
        output_size,
        kernel_size,
        stride,
        pad,
        dilation,
        fractional_stride,
        output_pad,
        groups=1,
        debug=False,
):
    """
    Compute a 2D convolution.

    Note that all PyTorch numbers are ordered (C, H, W)
    """
    assert data.shape == tuple(input_size)
    in_channels = input_size[0]
    out_channels = output_size[0]

    if debug:
        # Slow route using pure Python
        ref = np.full(shape=output_size, fill_value=np.nan, dtype=np.int64)
        debug_print('k,c,x,y,weight,data,prod,cacc,acc')

        for k in range(out_channels):
            for y in range(-pad[0],
                           input_size[1] - dilation[0] * (kernel_size[0] - 1) + pad[0],
                           stride[0]):
                for y_frac in range(fractional_stride[0]):
                    for x in range(-pad[1],
                                   input_size[2] - dilation[1] * (kernel_size[1] - 1) + pad[1],
                                   stride[1]):
                        for x_frac in range(fractional_stride[1]):
                            val = np.int64(0)
                            c = 0
                            while True:
                                dc = c if groups == 1 else c + k * (in_channels // groups)
                                sval = np.int64(0)
                                for h in range(kernel_size[0]):
                                    for w in range(kernel_size[1]):
                                        ypos = (y + pad[0])*fractional_stride[0] - pad[0] \
                                            + y_frac + h * dilation[0]
                                        yd, yr = divmod(ypos, fractional_stride[0])
                                        xpos = (x + pad[1])*fractional_stride[1] - pad[1] \
                                            + x_frac + w * dilation[1]
                                        xd, xr = divmod(xpos, fractional_stride[1])
                                        if yr == 0 and 0 <= yd < input_size[1] and \
                                           xr == 0 and 0 <= xd < input_size[2]:
                                            prod = weight[k][c][h][w] * data[dc][yd][xd]
                                            sval += prod
                                            val += prod
                                            stats.true_macc += 1
                                            debug_print(
                                                f'{k},{c},{x},{y},{weight[k][c][h][w]},'
                                                f'{data[dc][yd][xd]},{prod},{sval},{val}'
                                            )
                                c += 16
                                if c >= in_channels // groups:
                                    c = (c + 1) % 16
                                    if c in (0, in_channels // groups):
                                        break

                            if bias is not None:
                                val += bias[k]
                                debug_print(
                                    f'     adding bias: {bias[k]} -> result: {val}'
                                )

                            ref[k][
                                ((y + pad[0])*fractional_stride[0] + y_frac) // stride[0]
                            ][
                                ((x + pad[1])*fractional_stride[1] + x_frac) // stride[1]
                            ] = val

    # Fast computation using NumPy

    # Stretch data for fractionally-strided convolution
    if fractional_stride[0] > 1 or fractional_stride[1] > 1:
        ndata = np.zeros((data.shape[0],
                          data.shape[1] * fractional_stride[0],
                          data.shape[2] * fractional_stride[1]),
                         dtype=data.dtype)
        ndata[:, 0::fractional_stride[0], 0::fractional_stride[1]] = data
        data = ndata

    # Create zero padding around data and stretch weights for dilation.
    if pad[0] or pad[1] or output_pad[0] or output_pad[1]:
        data = np.pad(data, pad_width=((0, 0),
                                       (pad[0], pad[0]),
                                       (pad[1], pad[1])),
                      mode='constant', constant_values=0)

    if dilation[0] > 1 or dilation[1] > 1:
        nweight = np.zeros((weight.shape[0], weight.shape[1],
                            (kernel_size[0] - 1) * dilation[0] + 1,
                            (kernel_size[1] - 1) * dilation[1] + 1),
                           dtype=weight.dtype)
        nweight[:, :, 0::dilation[0], 0::dilation[1]] = weight
        weight = nweight

    h = (data.shape[1] - weight.shape[2] + 1) // stride[0]  # Resulting output height
    w = (data.shape[2] - weight.shape[3] + 1) // stride[1]  # Resulting output width

    view = as_strided(data,
                      shape=(h, w, data.shape[0], weight.shape[2], weight.shape[3]),
                      strides=((data.strides[1] * stride[0], data.strides[2] * stride[1],
                                data.strides[0], data.strides[1], data.strides[2])),
                      writeable=False)
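    # The strided view exposes every kernel-sized window without copying: for a single-channel
    # 3x3 input with a 2x2 kernel and stride 1, `view` has shape (2, 2, 1, 2, 2) and
    # view[i, j, 0] is the 2x2 window whose top-left corner is at (i, j). The tensordot below
    # then reduces each window against each output channel's kernel.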

    if groups > 1:
        nweight = np.zeros((weight.shape[0], in_channels, weight.shape[2], weight.shape[3]),
                           dtype=weight.dtype)
        for i in range(weight.shape[0]):
            for j in range(in_channels // groups):
                nweight[i, i * (in_channels // groups) + j, :, :] = weight[i, j, :, :]
        weight = nweight

    output = np.tensordot(view, weight, axes=((2, 3, 4), (1, 2, 3))).transpose(2, 0, 1)

    # Apply bias
    if bias is not None:
        for k in range(out_channels):
            output[k] += bias[k]

    if debug:
        if not (ref == output).all():
            eprint('NumPy <-> Python mismatch in compute.conv2d')
            sys.exit(1)

    assert output.shape == tuple(output_size)

    return output
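
# A minimal usage sketch for the conv2d() above. The shapes and values are hypothetical; it
# assumes only numpy and the `as_strided` import used by the function, and leaves debug=False
# so the pure-Python reference path (and its stats/debug_print helpers) is not exercised.
if __name__ == '__main__':
    example_data = np.arange(16, dtype=np.int64).reshape(1, 4, 4)  # one channel, 4x4 (C, H, W)
    example_weight = np.ones((1, 1, 3, 3), dtype=np.int64)         # one 3x3 kernel (K, C, kH, kW)
    example_out = conv2d(example_data, example_weight, bias=None,
                         input_size=[1, 4, 4], output_size=[1, 2, 2],
                         kernel_size=[3, 3], stride=[1, 1], pad=[0, 0],
                         dilation=[1, 1], fractional_stride=[1, 1], output_pad=[0, 0])
    # example_out has shape (1, 2, 2); each entry is the sum of one 3x3 window of the input.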
Exemple #18
0
def load(
        verbose,  # pylint: disable=unused-argument
        embedded_code,
        apb,
        layers,
        bias,
        quantization,  # pylint: disable=unused-argument
        group_map,
        output_chan,
        streaming,
        debug,  # pylint: disable=unused-argument
):
    """
    Write `bias` values for the network to C code.
    """
    # Bias: Each group has one bias memory (size BIAS_SIZE bytes). Use only the bias memory in
    # one selected group for the layer, and only if the layer uses a bias. Keep track of the
    # offsets so they can be programmed into the mask count register later. (A standalone
    # sketch of the group-selection policy follows this function.)

    if embedded_code:
        bias_values = np.zeros((tc.dev.P_NUMGROUPS, tc.dev.BIAS_SIZE),
                               dtype=np.int64)

    group_bias_max = [0] * tc.dev.P_NUMGROUPS
    bias_offs = [None] * layers
    bias_group = [None] * layers
    for ll in range(layers):
        if bias[ll] is None:
            continue
        if len(bias[ll]) != output_chan[ll]:
            eprint(
                f'Layer {ll}: output channel count {output_chan[ll]} does not match the number '
                f'of bias values {len(bias[ll])}.')
            sys.exit(1)
        q = 8  # Fixed to 8 bits instead of quantization[ll]
        qfactor = 8 // q
        # Round up the divided length of bias values
        # FIXME: Is it necessary to handle gaps in the next layer?
        bias_len = (output_chan[ll] + qfactor - 1) // qfactor

        if ll == 0 and streaming[ll] and tc.dev.FIX_STREAM_BIAS:
            # Work around a problem on AI85
            bias_len += 1
        if streaming[ll] and tc.dev.FIX_STREAM_BIAS:
            eprint(
                f'Layer {ll} uses streaming and a bias. '
                'THIS COMBINATION MIGHT NOT BE FUNCTIONING CORRECTLY!!!',
                error=False)

        # Pick the group with the least amount of data in it
        group = argmin(group_bias_max[t] for t in group_map[ll])
        if group_bias_max[group] + bias_len > tc.dev.BIAS_SIZE:
            eprint(
                f'Layer {ll}: bias memory capacity exceeded - available groups: '
                f'{group_map[ll]}, used so far: {group_bias_max}, needed: {bias_len}.'
            )
            sys.exit(1)
        bias_group[ll] = group
        bias_offs[ll] = group_bias_max[group]
        # Each layer has output_channel number of bias values
        i = 0
        target_offs = 0
        if ll == 0 and streaming[ll] and tc.dev.FIX_STREAM_BIAS:
            # Work around a problem on AI85
            if not embedded_code:
                apb.write_bias(group, bias_offs[ll], 0)
            else:
                # Store for later
                bias_values[group][bias_offs[ll]] = 0
            target_offs += 1
        while i < output_chan[ll]:
            b = combine(bias[ll], q, i, output_chan[ll])
            if not embedded_code:
                apb.write_bias(group, bias_offs[ll] + target_offs, b)
            else:
                # Store for later
                bias_values[group][bias_offs[ll] + target_offs] = b & 0xff
            i += qfactor
            target_offs += 1
        group_bias_max[group] += bias_len

    if embedded_code:
        if max(group_bias_max) > 0:
            # At least one bias value exists, output defines
            for group in range(tc.dev.P_NUMGROUPS):
                if group_bias_max[group] == 0:
                    continue  # but not for this group
                apb.output_define(bias_values[group][:group_bias_max[group]],
                                  f'BIAS_{group}', '0x%02x', 16)
            # Output variables
            for group in range(tc.dev.P_NUMGROUPS):
                if group_bias_max[group] == 0:
                    continue
                apb.output(
                    f'static const uint8_t bias_{group}[] = BIAS_{group};\n')
            apb.output('\n')

            # Finally, create function and do memcpy()
            apb.output(
                'void memcpy_8to32(uint32_t *dst, const uint8_t *src, size_t n)\n{\n'
            )
            apb.output('  while (n-- > 0) {\n    *dst++ = *src++;\n  }\n}\n\n')

            apb.output('void load_bias(void)\n{\n')
            for group in range(tc.dev.P_NUMGROUPS):
                if group_bias_max[group] == 0:
                    continue
                addr = apb.apb_base + tc.dev.C_GROUP_OFFS * group + tc.dev.C_BRAM_BASE
                apb.output(
                    f'  memcpy_8to32((uint32_t *) 0x{addr:08x}, bias_{group}, '
                    f'sizeof(uint8_t) * {group_bias_max[group]});\n')
            apb.output('}\n\n')

    return bias_offs, bias_group, group_bias_max
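
# The group selection above is a greedy fill: each layer that has a bias goes into whichever of
# its allowed groups currently holds the least bias data, as long as the values still fit. A
# standalone sketch of that policy (hypothetical group count and BIAS_SIZE; this is not the
# generator's argmin() helper):
def pick_bias_group(group_bias_max, allowed_groups, bias_len, bias_size=512):
    """Return the allowed group with the most free bias memory, or None if `bias_len` won't fit."""
    group = min(allowed_groups, key=lambda g: group_bias_max[g])
    if group_bias_max[group] + bias_len > bias_size:
        return None
    return group

# Example: with group_bias_max = [10, 4, 7, 0] and allowed_groups = [0, 2, 3], bias values for
# the next layer would go into group 3, the emptiest of the allowed groups.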
Exemple #19
0
def extract_funct(asm_file, funct_name, line_num, dwarf_loc):
    """Constructs a function from the assembly file. 

    File pointer must point at first instruction of the function. The return 
    dictionary and target list of site are not built here.

    Only fields initialized in a function's contstructor are initialized. 
    However, each site of a function has its return dictionary linked to the 
    function's return dictionary
    """
    start_line_num = line_num
    call_list = ["call", "callf", "callq"]
    returns = ["ret", "retf", "iret", "retq", "iretq"]
    jmp_list = [
        "jo", "jno", "jb", "jnae", "jc", "jnb", "jae", "jnc", "jz", "je",
        "jnz", "jne", "jbe", "jna", "jnbe", "ja", "js", "jns", "jp", "jpe",
        "jnp", "jpo", "jl", "jnge", "jnl", "jge", "jle", "jng", "jnle", "jg",
        "jecxz", "jrcxz", "jmp", "jmpe"
    ]
    CALL_SITE, RETURN_SITE, INDIR_JMP_SITE, PLT_SITE = 0, 1, 2, 3

    asm_line = asm_file.readline()
    line_num += 1
    try:
        first_word = asm_line.split()[0]
    except IndexError:
        pass  # ignore empty line

    comment_continues = False
    sites = []
    direct_call_sites = []
    empty_ret_dict = dict()

    while asm_line:
        asm_parsing.update_dwarf_loc(asm_line, dwarf_loc)
        try:
            first_word = asm_line.split()[0]
        except IndexError:
            # ignore empty line
            asm_line = asm_file.readline()
            line_num += 1
            continue

        if first_word[:len('.LFE')] == '.LFE':
            break
        else:
            targets = []
            labels, key_symbol, arg_str, comment_continues = (
                asm_parsing.decode_line(asm_line, comment_continues))

        if key_symbol in call_list:
            new_site = funct_cfg.Site(line_num, targets, CALL_SITE, dwarf_loc)

            if '%' not in arg_str:
                new_site.targets.append(arg_str)
                direct_call_sites.append(new_site)
            sites.append(new_site)
        elif key_symbol in returns:
            # empty return dict passed so that every site's return dict is
            # a reference to the function's return dict
            new_ret_site = funct_cfg.Site(line_num, empty_ret_dict,
                                          RETURN_SITE, dwarf_loc)
            #new_ret_site.cdi_return_sites.append(funct_cfg.CDIRetSite(asm_file))
            sites.append(new_ret_site)

        elif key_symbol in jmp_list:
            if '%' in arg_str:
                sites.append(
                    funct_cfg.Site(line_num, targets, INDIR_JMP_SITE,
                                   dwarf_loc))
        asm_line = asm_file.readline()
        line_num += 1
    else:
        eprint(
            dwarf_loc.filename() + ':' + asm_file.name + ':' +
            str(start_line_num) + ' error: unterminated function: ', funct_name)
        sys.exit(1)

    fn = dwarf_loc.filename()
    new_funct = funct_cfg.Function(funct_name, asm_file.name, fn, sites,
                                   start_line_num)
    new_funct.direct_call_sites = direct_call_sites
    new_funct.ret_dict = empty_ret_dict

    return new_funct, line_num
Exemple #20
0
def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-branches
    prefix,
    verbose,
    verbose_all,
    debug,
    log,
    layers,
    operator,
    auto_input_dim,
    input_dim,
    pooled_dim,
    output_dim,
    kernel_size,
    quantization,  # pylint: disable=unused-argument
    output_shift,
    input_chan,
    output_chan,
    conv_groups,
    output_width,
    padding,
    dilation,
    stride,
    pool,
    pool_stride,
    pool_average,
    activation,
    data,
    kernel,
    bias,
    fc_weights,
    fc_bias,
    flatten,
    operands,
    eltwise,
    pool_first,
    in_sequences,
    c_filename,
    base_directory,
    log_filename,
    weight_filename,
    sample_filename,
    avg_pool_rounding,
    device=84,
    legacy_test=False,
):
    """
    Create the CMSIS NN network.
    """
    if output_width[-1] != 8:
        eprint(
            'CMSIS network generator does not currently support `output_width` other than 8. '
            'Forcing output width to 8 bits.',
            error=False)  # FIXME: Support 32-bit output
        output_width[-1] = 8

    input_dim_str = [None] * layers
    output_dim_str = [None] * layers
    kernel_size_str = [None] * layers
    pool_str = [None] * layers
    padding_str = [None] * layers
    pool_stride_str = [None] * layers
    stride_str = [None] * layers

    for ll in range(layers):
        if quantization[ll] is None:
            quantization[ll] = 8  # Set default
        elif quantization[ll] != 8:  # FIXME: Support quantization
            eprint(
                'CMSIS network generator does not currently support `quantization` != 8.'
            )
            sys.exit(1)

        if output_shift[ll] is None:
            output_shift[ll] = 0  # Set default

        if operator[ll] != op.CONV1D:
            input_dim_str[ll] = f'{input_dim[ll][0]}x{input_dim[ll][1]}'
            output_dim_str[ll] = f'{output_dim[ll][0]}x{output_dim[ll][1]}'
            kernel_size_str[ll] = f'{kernel_size[ll][0]}x{kernel_size[ll][1]}'
            pool_str[ll] = f'{pool[ll][0]}x{pool[ll][1]}' \
                if pool[ll][0] > 1 or pool[ll][1] > 1 else '0x0'
            padding_str[ll] = f'{padding[ll][0]}/{padding[ll][1]}'
            pool_stride_str[ll] = f'{pool_stride[ll][0]}/{pool_stride[ll][1]}'
            stride_str[ll] = f'{stride[ll][0]}/{stride[ll][1]}'
        else:
            input_dim_str[ll] = f'{input_dim[ll][0]}'
            output_dim_str[ll] = f'{output_dim[ll][0]}'
            kernel_size_str[ll] = f'{kernel_size[ll][0]}'
            pool_str[ll] = f'{pool[ll][0]}' \
                if pool[ll][0] > 1 or pool[ll][1] > 1 else '0'
            padding_str[ll] = f'{padding[ll][0]}'
            pool_stride_str[ll] = f'{pool_stride[ll][0]}'
            stride_str[ll] = f'{stride[ll][0]}'

        if input_chan[ll] % conv_groups[ll] != 0 or output_chan[
                ll] % conv_groups[ll] != 0:
            eprint(
                f'Layer {ll}: convolution groups {conv_groups[ll]} does not divide'
                f' the input channels {input_chan[ll]} or output channels {output_chan[ll]}.'
            )
            sys.exit(1)

    test_name = prefix
    print(f'{test_name}...')

    os.makedirs(os.path.join(base_directory, test_name), exist_ok=True)

    # Redirect stdout?
    if log:
        sys.stdout = open(
            os.path.join(base_directory, test_name, log_filename), 'w')
        print(f'{" ".join(str(x) for x in sys.argv)}')
        print(f'{devices.partnum(device)}\n')
        print(f'{test_name}')

    filename = c_filename + '.c'
    sampledata_header = \
        open(os.path.join(base_directory, test_name, sample_filename), mode='w')
    weight_header = \
        open(os.path.join(base_directory, test_name, weight_filename), mode='w')

    with open(os.path.join(base_directory, test_name, filename),
              mode='w') as c_file:
        toplevel.copyright_header(c_file)

        c_file.write(f'// {test_name}\n')
        c_file.write(
            f'// Created using {" ".join(str(x) for x in sys.argv)}\n')
        c_file.write('\n')

        toplevel.header(c_file, 0, embedded_code=True, cmsis_nn=True)

        # Pre-define data memory loader.
        d = data.transpose((1, 2, 0)).flatten()  # CHW -> HWC
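        # For example, a 3x2x2 CHW tensor [[[1,2],[3,4]], [[5,6],[7,8]], [[9,10],[11,12]]]
        # flattens in HWC order to 1,5,9, 2,6,10, 3,7,11, 4,8,12: all channels of a pixel
        # become adjacent, which is the layout the CMSIS-NN HWC kernels expect.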
        toplevel.c_define(sampledata_header, d, 'INPUT_DATA', '%d', 16)
        input_size = d.size
        c_file.write('static const q7_t input_data[] = INPUT_DATA;\n')
        c_file.write(
            f'static const q{output_width[-1]-1}_t output_data[] = OUTPUT_DATA; '
            '// Last conv layer output\n')

        # Pre-define the kernels and bias values
        for ll in range(layers):
            # Rearrange kernels when emulating a fully connected network using 1x1 Conv2D
            # CMSIS data uses HWC, PyTorch uses CHW
            if operator[ll] != op.NONE:
                if kernel_size[ll] == [1, 1] and input_dim[ll] == [1, 1]:
                    w = kernel[ll]. \
                        reshape((output_chan[ll],
                                input_chan[ll] // (auto_input_dim[ll][0] * auto_input_dim[ll][1]),
                                auto_input_dim[ll][0], auto_input_dim[ll][1],
                                kernel_size[ll][0], kernel_size[ll][1])). \
                        transpose((0, 4, 5, 2, 3, 1)). \
                        flatten()
                elif flatten[ll]:
                    w = kernel[ll]. \
                        reshape((output_chan[ll],
                                input_chan[ll],
                                auto_input_dim[ll][0], auto_input_dim[ll][1],
                                kernel_size[ll][0], kernel_size[ll][1])). \
                        transpose((0, 4, 5, 2, 3, 1)). \
                        flatten()
                else:
                    w = kernel[ll]. \
                        reshape((output_chan[ll], input_chan[ll],
                                kernel_size[ll][0], kernel_size[ll][1])). \
                        transpose((0, 2, 3, 1)). \
                        flatten()
                toplevel.c_define(weight_header, w, f'WEIGHTS_{ll}', '%d', 16)
                if bias[ll] is not None:
                    b = bias[ll].flatten()
                else:
                    # We need empty bias values (the Arm code needs them both for rounding of
                    # the shifted output, and it does not like NULL bias pointers)
                    b = np.zeros(output_chan[ll], dtype=np.int64)
                toplevel.c_define(weight_header, b, f'BIAS_{ll}', '%d', 16)
        c_file.write('\n')

        for ll in range(layers):
            if operator[ll] != op.NONE:
                c_file.write(
                    f'static const q7_t weights_{ll}[] = WEIGHTS_{ll};\n')
                c_file.write(f'static const q7_t bias_{ll}[] = BIAS_{ll};\n')
        c_file.write('\n')

        # Compute buffer sizes
        col_buffer_size = 0
        img_buffer_size = 0
        for ll in range(layers):
            col_buffer_size = max(
                col_buffer_size,
                2 * input_chan[ll] * kernel_size[ll][0] * kernel_size[ll][1])
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                col_buffer_size = max(col_buffer_size, pooled_dim[ll][0] *
                                      input_chan[ll])  # q15_t doesn't need 2*
            img_buffer_size = max(
                img_buffer_size,
                input_chan[ll] * input_dim[ll][0] * input_dim[ll][1],
                output_chan[ll] * output_dim[ll][0] * output_dim[ll][1])
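        # For instance, a layer with 16 input channels and a 3x3 kernel needs a
        # 2*16*3*3 = 288-entry col_buffer as im2col working space, and the image buffers
        # must hold the larger of any layer's input and output feature maps.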

        c_file.write(
            f'static q7_t buffer0[{max(img_buffer_size, input_size)}];\n')
        c_file.write(f'static q7_t buffer1[{img_buffer_size}];\n')
        c_file.write(f'static q15_t col_buffer[{col_buffer_size}];\n\n')

        c_file.write('int cnn_run(const q7_t *input, int input_size, '
                     'q7_t **output, int *output_size)\n{\n')

        # Compute layer-by-layer output and chain results into input
        buffer0, buffer1 = 'buffer0', 'buffer1'
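        # buffer0/buffer1 are used as ping-pong buffers: each layer writes its result into
        # buffer1 and the two names are then swapped, so the next layer reads the previous
        # layer's output as its input.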

        def run_eltwise(
            data,
            ll,
        ):
            """
            In-flight element-wise operations
            """
            if operator[ll] == op.NONE:
                # Let element-wise do 32-bit, else 8-bit only
                o_width = output_width[ll]
            else:
                o_width = 8
            d_shape = data.shape

            data, out_size = eltwise_layer(
                eltwise[ll],
                ll,
                verbose,
                verbose_all or ll == layers - 1,
                data[0].shape,
                output_shift[ll],
                data,
                output_width=o_width,
                device=device,
                debug=False,
                operands=operands[ll],
            )
            assert out_size[0] == d_shape[1] \
                and out_size[1] == d_shape[2] and out_size[2] == d_shape[3]

            return data

        data_buf = [data]
        # Compute layer-by-layer output and chain results into input
        for ll in range(layers):
            # Concatenate input data if needed
            if in_sequences[ll] is not None:
                if isinstance(in_sequences[ll], list):
                    try:
                        data = np.concatenate(
                            [data_buf[i + 1] for i in in_sequences[ll]],
                            axis=0)
                    except ValueError as err:
                        eprint('Error in input data concatenation layer:', err)
                        sys.exit(1)
                else:
                    data = data_buf[in_sequences[ll] + 1]
            else:
                data = data_buf[-1]

            # Split data into multiple inputs if needed
            if operands[ll] > 1:
                if ll == 0 and legacy_test:
                    data = np.array(np.split(data, operands[ll], axis=0))
                elif legacy_test:
                    d = np.empty((operands[ll], data.shape[0], data.shape[1],
                                  data.shape[2] // operands[ll]),
                                 dtype=np.int64)
                    for i in range(operands[ll]):
                        d[i, :, :, :] = data[:, :, i::operands[ll]]
                    data = d
                else:
                    data = np.array(np.split(data, operands[ll], axis=0))
            else:
                data = np.expand_dims(data, 0)

            show_data(
                ll,
                verbose,
                verbose_all or ll == layers - 1,
                data.shape,
                data,
                debug=False,
                expand=1,
                expand_thresh=1,
                operation=operator[ll],
                operands=operands[ll],
            )

            in_chan = input_chan[ll]

            # Run in-flight element-wise operations first?
            if operands[ll] > 1 and not pool_first[ll]:
                eprint(
                    "Element-wise operations are currently not implemented for CMSIS-NN"
                )
                sys.exit(1)  # FIXME: Support element-wise operations
                data = np.expand_dims(run_eltwise(data, ll), 0)

            # Allow 1D <-> 2D and 2D W/L conversions
            if operator[ll] == op.CONV1D:
                assert input_dim[ll][1] == 1
                data = data.reshape(data.shape[0], data.shape[1],
                                    input_dim[ll][0])
            else:
                data = data.reshape(data.shape[0], data.shape[1],
                                    input_dim[ll][0], input_dim[ll][1])

            # In-flight pooling
            data, out_size = pooling_layer(
                ll,
                verbose,
                verbose_all or ll == layers - 1,
                data[0].shape,
                pool[ll],
                pool_stride[ll],
                pool_average[ll],
                data,
                debug=False,
                expand=1,
                expand_thresh=1,
                operation=operator[ll],
                operands=data.shape[0],
                rounding=avg_pool_rounding,
                debug_data=None,
            )

            if operator[ll] == op.CONV1D:
                assert out_size[0] == in_chan \
                    and out_size[1] == pooled_dim[ll][0] \
                    and pooled_dim[ll][1] == 1
            else:
                assert out_size[0] == in_chan \
                    and out_size[1] == pooled_dim[ll][0] \
                    and out_size[2] == pooled_dim[ll][1]

            if operands[ll] > 1 and pool_first[ll]:
                data = run_eltwise(data, ll)
            else:
                data = np.squeeze(data, axis=0)

            # Convolution or passthrough
            if operator[ll] == op.CONV2D:
                if flatten[ll]:
                    in_chan *= input_dim[ll][0] * input_dim[ll][1]
                    data = data.reshape(in_chan, 1, 1)
                    if verbose:
                        print(f"FLATTEN TO {in_chan}x1x1...\n")

                out_buf, out_size = conv2d_layer(
                    ll,
                    verbose,
                    verbose_all or ll == layers - 1,
                    data.shape,
                    kernel_size[ll],
                    output_shift[ll],
                    output_chan[ll],
                    padding[ll],
                    dilation[ll],
                    stride[ll],
                    activation[ll],
                    kernel[ll].reshape(output_chan[ll], in_chan,
                                       kernel_size[ll][0], kernel_size[ll][1]),
                    bias[ll],
                    data,
                    output_width=output_width[ll],
                    groups=conv_groups[ll],
                    device=device,
                    debug=False,
                )
            elif operator[ll] == op.CONVTRANSPOSE2D:
                out_buf, out_size = convtranspose2d_layer(
                    ll,
                    verbose,
                    verbose_all or ll == layers - 1,
                    data.shape,
                    kernel_size[ll],
                    output_shift[ll],
                    output_chan[ll],
                    padding[ll],
                    dilation[ll],
                    stride[ll],
                    [1, 1],  # output_padding
                    activation[ll],
                    kernel[ll].reshape(
                        output_chan[ll],
                        in_chan,
                        kernel_size[ll][0],
                        kernel_size[ll][1],
                    ),
                    bias[ll],
                    data,
                    output_width=output_width[ll],
                    groups=conv_groups[ll],
                    device=device,
                    debug=False,
                )
            elif operator[ll] == op.CONV1D:
                out_buf, out_size = conv1d_layer(
                    ll,
                    verbose,
                    verbose_all or ll == layers - 1,
                    data.shape,
                    kernel_size[ll][0],
                    output_shift[ll],
                    output_chan[ll],
                    padding[ll][0],
                    dilation[ll][0],
                    stride[ll][0],
                    activation[ll],
                    kernel[ll].reshape(
                        output_chan[ll],
                        input_chan[ll],
                        kernel_size[ll][0],
                    ),
                    bias[ll],
                    data,
                    output_width=output_width[ll],
                    groups=conv_groups[ll],
                    device=device,
                    debug=False,
                )
            elif operator[ll] == op.NONE:  # '0'D (pooling only or passthrough)
                out_buf, out_size = passthrough_layer(
                    ll,
                    verbose,
                    verbose_all or ll == layers - 1,
                    data.shape,
                    data,
                    device=device,
                    debug=False,
                )
            else:
                eprint(f'Unknown operator `{op.string(operator[ll])}`.')
                sys.exit(1)

            assert out_size[0] == output_chan[ll] \
                and out_size[1] == output_dim[ll][0] and out_size[2] == output_dim[ll][1]

            c_file.write(f'  // Layer {ll}: '
                         f'{str(operands[ll])+"x" if operands[ll] > 1 else ""}'
                         f'{input_chan[ll]}x{input_dim_str[ll]}'
                         f'{" flattened, " if flatten[ll] else ", "}')
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                c_file.write(
                    f'{pool_str[ll]} {"avg" if pool_average[ll] else "max"} '
                    f'pool with stride {pool_stride_str[ll]}')
            else:
                c_file.write('no pooling')
            if operator[ll] in [op.CONV1D, op.CONV2D, op.CONVTRANSPOSE2D]:
                conv_str = f', {op.string(operator[ll])} with kernel size ' \
                           f'{kernel_size_str[ll]}, ' \
                           f'stride {stride_str[ll]}, ' \
                           f'pad {padding_str[ll]}, '
            else:
                conv_str = ', no convolution, '
            c_file.write(conv_str +
                         f'{output_chan[ll]}x{output_dim_str[ll]} output\n')

            c_file.write(
                f'  // Dimensions: [{input_chan[ll]}, {input_dim[ll][0]}, '
                f'{input_dim[ll][1]}]')
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                c_file.write(
                    f' -> [{input_chan[ll]}, {pooled_dim[ll][0]}, {pooled_dim[ll][1]}]'
                )
            if flatten[ll]:
                c_file.write(
                    f' -> [{input_chan[ll]*pooled_dim[ll][0]*pooled_dim[ll][1]}, 1, 1]'
                )
            if operator[ll] != op.NONE:
                c_file.write(f' -> {out_size}\n')
            else:
                c_file.write('\n')

            source = 'input_data' if ll == 0 else buffer0

            if pool[ll][0] > 1 or pool[ll][1] > 1:
                if ll == 0:
                    c_file.write('  memcpy(buffer0, input, input_size);'
                                 ' // Pooling may destroy input\n')
                pool_type = 'ave' if pool_average[ll] else 'max'
                if pool[ll][0] != pool[ll][1]:
                    c_file.write(
                        f'  arm_{pool_type}pool_nonsquare_q7_HWC_nonsquare({buffer0}, '
                        f'{input_dim[ll][1]}, {input_dim[ll][0]}, '
                        f'{input_chan[ll]}, {pool[ll][1]}, {pool[ll][0]}, 0, 0, '
                        f'{pool_stride[ll][1]}, {pool_stride[ll][0]}, '
                        f'{pooled_dim[ll][1]}, {pooled_dim[ll][0]}, '
                        f'(q7_t *) col_buffer, {buffer1});\n')
                else:
                    if input_dim[ll][0] == input_dim[ll][1]:
                        c_file.write(
                            f'  arm_{pool_type}pool_q7_HWC({buffer0}, '
                            f'{input_dim[ll][0]}, {input_chan[ll]}, '
                            f'{pool[ll][0]}, 0, {pool_stride[ll][0]}, '
                            f'{pooled_dim[ll][0]}, (q7_t *) col_buffer, {buffer1});\n'
                        )
                    else:
                        c_file.write(
                            f'  arm_{pool_type}pool_q7_HWC_nonsquare({buffer0}, '
                            f'{input_dim[ll][1]}, {input_dim[ll][0]}, '
                            f'{input_chan[ll]}, {pool[ll][0]}, 0, {pool_stride[ll][0]}, '
                            f'{pooled_dim[ll][1]}, {pooled_dim[ll][0]}, '
                            f'(q7_t *) col_buffer, {buffer1});\n')
                source = buffer1
                buffer0, buffer1 = buffer1, buffer0

            if operator[ll] != op.NONE:
                in_chan = input_chan[ll]
                in_dim = pooled_dim[ll]
                if flatten[ll]:
                    in_chan *= pooled_dim[ll][0] * pooled_dim[ll][1]
                    in_dim = [1, 1]

                if operator[ll] == op.CONVTRANSPOSE2D:  # FIXME: Support ConvTranspose2d
                    eprint(
                        "CMSIS-NN generator does not currently support the operator "
                        f"`{op.string(operator[ll])}` in layer {ll}")
                    sys.exit(1)

                # FIXME: First check that everything is [-128, +127] and use s8 function otherwise

                # Check for squareness
                if kernel_size[ll][0] == kernel_size[ll][1] \
                   and in_dim[0] == in_dim[1] \
                   and output_dim[ll][0] == output_dim[ll][1] \
                   and padding[ll][0] == padding[ll][1] \
                   and stride[ll][0] == stride[ll][1]:
                    # Detect fully connected layers
                    if in_dim == [1, 1] and output_dim[ll] == [1, 1]:
                        c_file.write(
                            f'  arm_fully_connected_q7({source}, '
                            f'weights_{ll}, {in_chan}, {output_chan[ll]}, 7, 7, '
                            f'bias_{ll}, {buffer1}, '
                            'col_buffer);\n')
                    else:
                        fn = 'fast' if in_chan % 4 == 0 and output_chan[ll] % 2 == 0 \
                            else 'basic'
                        c_file.write(
                            f'  arm_convolve_HWC_q7_{fn}({source}, '
                            f'{in_dim[0]}, '
                            f'{in_chan}, weights_{ll}, {output_chan[ll]}, '
                            f'{kernel_size[ll][0]}, '
                            f'{padding[ll][0]}, '
                            f'{stride[ll][0]}, '
                            f'bias_{ll}, 7, 7, {buffer1}, '
                            f'{output_dim[ll][0]}, '
                            'col_buffer, NULL);\n')
                else:
                    c_file.write(
                        f'  arm_convolve_HWC_q7_basic_nonsquare({source}, '
                        f'{in_dim[1]}, {in_dim[0]}, '
                        f'{in_chan}, weights_{ll}, {output_chan[ll]}, '
                        f'{kernel_size[ll][1]}, {kernel_size[ll][0]}, '
                        f'{padding[ll][1]}, {padding[ll][0]}, '
                        f'{stride[ll][1]}, {stride[ll][0]},\n'
                        '                                      '
                        f'bias_{ll}, 7, 7, {buffer1}, '
                        f'{output_dim[ll][1]}, {output_dim[ll][0]}, '
                        'col_buffer, NULL);\n')

                assert out_size[0] == output_chan[ll] \
                    and out_size[1] == output_dim[ll][0] and out_size[2] == output_dim[ll][1]

                if activation[ll] == op.ACT_RELU:
                    size = output_dim[ll][0] * output_dim[ll][1] * output_chan[
                        ll]
                    if size < 65536:
                        c_file.write(f'  arm_relu_q7({buffer1}, {size});\n')
                    else:
                        c_file.write(f'  arm_relu32_q7({buffer1}, {size});\n')
                elif activation[
                        ll] is not None:  # FIXME: Support abs() activation
                    eprint("CMSIS-NN generator implements ReLU only.")
                    sys.exit(1)
                buffer0, buffer1 = buffer1, buffer0

            data_buf.append(out_buf.reshape(out_size))
            c_file.write('\n')
            data_cmsis = data_buf[-1].transpose((1, 2, 0)).flatten()
            if verbose:
                print('TRANSPOSED (HWC) AND FLATTENED:')
                print(data_cmsis)
                print('')

        data = data_buf[-1]

        c_file.write(f'  *output = {buffer0};\n'
                     f'  *output_size = {data_cmsis.size};\n\n'
                     '  return 1;\n}\n\n')

        if fc_weights:
            data = data.flatten()

            out_buf, out_size = linear_layer(verbose=verbose,
                                             verbose_data=False,
                                             activation=False,
                                             weight=fc_weights[0],
                                             bias=fc_bias[0],
                                             data=data,
                                             debug=debug)

            # Rearrange the weights to account for the shape of the conv layer output
            w = fc_weights[0]. \
                reshape((fc_weights[0].shape[0], output_chan[ll],
                         output_dim[ll][0], output_dim[ll][1])). \
                transpose(0, 2, 3, 1). \
                reshape((fc_weights[0].shape[0], fc_weights[0].shape[1]))

            # np.dot(worg, torg.flatten()) should be equal to np.dot(wnew, tnew.flatten())
            assert (np.dot(fc_weights[0], data) == np.dot(w, data_cmsis)).all()

            toplevel.fc_layer(c_file,
                              weight_header,
                              w,
                              fc_bias[0],
                              cmsis_nn=True)

        c_file.write(
            'int main(void)\n{\n'
            '  int i;\n'
            '  q7_t *output;\n'
            '  int output_size;\n\n'
            f'  cnn_run(input_data, {input_size}, &output, &output_size);\n\n')

        toplevel.c_define(sampledata_header, data_cmsis, 'OUTPUT_DATA', '%d',
                          16)
        c_file.write('  if (memcmp(output_data, output, output_size) == 0)\n'
                     '    printf("*** PASS ***\\n\\n");\n'
                     '  else\n'
                     '    printf("!!! FAIL !!!\\n\\n");\n\n')

        if fc_weights:
            c_file.write('  fc_layer(output);\n\n')
            c_file.write(
                '  printf("Classification results:\\n");\n'
                '  for (i = 0; i < NUM_CLASSES; i++) {\n'
                '    printf("[%6d] -> Class %d: %0.1f%%\\n", fc_output[i], i, '
                '(double) (100.0 * ml_softmax[i] / 32768.0));\n'
                '  }\n\n')
        else:
            c_file.write('  printf("Output of final layer:\\n");\n'
                         '  for (i = 0; i < output_size; i++) {\n'
                         '    printf("%5hhd", (int8_t) (output[i] & 0xff));\n'
                         '    if ((i + 1) % 32 == 0)\n      printf("\\n");\n'
                         '    else if ((i + 1) % 4 == 0)\n      printf(" ");\n'
                         '  }\n'
                         '  printf("\\n");\n'
                         '\n')

        c_file.write('  return 0;\n}\n\n')

    # Close header files
    sampledata_header.close()
    weight_header.close()

    assets.copy('assets', 'cmsis-nn', base_directory, test_name)
Exemple #21
0
        target_host = a
    if o == '-p':
        target_port = int(a)
    if o == '-l':
        listen_port = int(a)

listenTuple = ('', listen_port)
listenSocket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
listenSocket.setblocking(0)
listenSocket.bind(listenTuple)

sendTuple = (target_host, target_port)
sendSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sendSocket.connect(sendTuple)

eprint("Listening on port %d" % (listen_port))
eprint("Sending to %s port %d" % (target_host, target_port))

for msgDict in generateDicts(listenSocket):
    if 'date' not in msgDict:
        msgDict['date'] = makeDate('now')
    else:
        msgDict['date'] = makeDate(msgDict['date'].rstrip(':'))

    if 'msg_type' not in msgDict:
        msgDict['msg_type'] = msgDict['message'].partition(' ')[0].partition(
            '[')[0]

    if 'id' in msgDict:
        msgDict['instance'] = msgDict['id'] + '_' + msgDict['host']
        if 'key_fields' in msgDict:
Exemple #22
0
def create_net(
        prefix,
        verbose,
        debug,
        log,
        layers,
        operator,
        auto_input_dim,
        input_dim,
        pooled_dim,
        output_dim,
        kernel_size,
        quantization,  # pylint: disable=unused-argument
        output_shift,
        input_chan,
        output_chan,
        output_width,
        padding,
        dilation,
        stride,
        pool,
        pool_stride,
        pool_average,
        activate,
        data,
        kernel,
        bias,
        fc_weights,
        fc_bias,
        flatten,
        c_filename,
        base_directory,
        log_filename,
        weight_filename,
        sample_filename,
        device=84,
):
    """
    Create the CMSIS NN network.
    """
    if any(w != 8 for w in output_width):
        eprint('CMSIS network generator does not currently support `output_width` that is not 8.')
        sys.exit(1)

    test_name = prefix
    print(f'{test_name}...')

    os.makedirs(os.path.join(base_directory, test_name), exist_ok=True)

    # Redirect stdout?
    if log:
        sys.stdout = open(os.path.join(base_directory, test_name, log_filename), 'w')
        print(f'{test_name}')

    filename = c_filename + '.c'
    sampledata_header = \
        open(os.path.join(base_directory, test_name, sample_filename), mode='w')
    weight_header = \
        open(os.path.join(base_directory, test_name, weight_filename), mode='w')

    with open(os.path.join(base_directory, test_name, filename), mode='w') as c_file:
        toplevel.copyright_header(c_file)

        c_file.write(f'// {test_name}\n')
        c_file.write(f'// Created using {" ".join(str(x) for x in sys.argv)}\n')

        # Human readable description of test
        c_file.write(f'\n// Configuring {layers} layer{"s" if layers > 1 else ""}:\n')

        for ll in range(layers):
            c_file.write(f'// Layer {ll}: '
                         f'{input_chan[ll]}x{input_dim[ll][0]}x{input_dim[ll][1]}, ')
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                c_file.write(f'{pool[ll][0]}x{pool[ll][1]} {"avg" if pool_average[ll] else "max"} '
                             f'pool with stride {pool_stride[ll]}')
            else:
                c_file.write('no pooling')
            c_file.write(f', {kernel_size[ll][0]}x{kernel_size[ll][1]} convolution '
                         f'with stride {stride[ll]} '
                         f'pad {padding[ll]}, '
                         f'{output_chan[ll]}x{output_dim[ll][0]}x{output_dim[ll][1]} out\n')

        c_file.write('\n')
        toplevel.header(c_file, 0, embedded_code=True, cmsis_nn=True)

        # Pre-define data memory loader.
        d = data.transpose((1, 2, 0)).flatten()  # CHW -> HWC
        toplevel.c_define(sampledata_header, d, 'INPUT_DATA', '%d', 16)
        input_size = d.size
        c_file.write('static const q7_t input_data[] = INPUT_DATA;\n')
        c_file.write('static const q7_t output_data[] = OUTPUT_DATA; // Last conv layer output\n')

        # Pre-define the kernels and bias values
        for ll in range(layers):
            # Rearrange kernels when emulating a fully connected network using 1x1 Conv2D
            # CMSIS data uses HWC, PyTorch uses CHW
            if kernel_size[ll] == [1, 1] and input_dim[ll] == [1, 1]:
                w = kernel[ll]. \
                    reshape((output_chan[ll],
                             input_chan[ll] // (auto_input_dim[ll][0] * auto_input_dim[ll][1]),
                             auto_input_dim[ll][0], auto_input_dim[ll][1],
                             kernel_size[ll][0], kernel_size[ll][1])). \
                    transpose((0, 4, 5, 2, 3, 1)). \
                    flatten()
            elif flatten[ll]:
                w = kernel[ll]. \
                    reshape((output_chan[ll],
                             input_chan[ll],
                             auto_input_dim[ll][0], auto_input_dim[ll][1],
                             kernel_size[ll][0], kernel_size[ll][1])). \
                    transpose((0, 4, 5, 2, 3, 1)). \
                    flatten()
            else:
                w = kernel[ll]. \
                    reshape((output_chan[ll], input_chan[ll],
                             kernel_size[ll][0], kernel_size[ll][1])). \
                    transpose((0, 2, 3, 1)). \
                    flatten()
            toplevel.c_define(weight_header, w, f'WEIGHTS_{ll}', '%d', 16)
            if bias[ll] is not None:
                b = bias[ll].flatten()
            else:
                # We need empty bias values (the Arm code needs them both for rounding of
                # the shifted output, and it does not like NULL bias pointers)
                b = np.zeros(output_chan[ll], dtype=np.int64)
            toplevel.c_define(weight_header, b, f'BIAS_{ll}', '%d', 16)
        c_file.write('\n')

        for ll in range(layers):
            c_file.write(f'static const q7_t weights_{ll}[] = WEIGHTS_{ll};\n')
            c_file.write(f'static const q7_t bias_{ll}[] = BIAS_{ll};\n')
        c_file.write('\n')

        # Compute buffer sizes
        col_buffer_size = 0
        img_buffer_size = 0
        for ll in range(layers):
            col_buffer_size = max(col_buffer_size,
                                  2*input_chan[ll]*kernel_size[ll][0]*kernel_size[ll][1])
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                col_buffer_size = max(col_buffer_size,
                                      pooled_dim[ll][0]*input_chan[ll])  # q15_t doesn't need 2*
            img_buffer_size = max(img_buffer_size,
                                  input_chan[ll]*input_dim[ll][0]*input_dim[ll][1],
                                  output_chan[ll]*output_dim[ll][0]*output_dim[ll][1])

        c_file.write(f'static q7_t buffer0[{max(img_buffer_size, input_size)}];\n')
        c_file.write(f'static q7_t buffer1[{img_buffer_size}];\n')
        c_file.write(f'static q15_t col_buffer[{col_buffer_size}];\n\n')

        c_file.write('int cnn_run(const q7_t *input, int input_size, '
                     'q7_t **output, int *output_size)\n{\n')

        # Compute layer-by-layer output and chain results into input
        buffer0, buffer1 = 'buffer0', 'buffer1'

        for ll in range(layers):
            c_file.write(f'  // Layer {ll}: [{input_chan[ll]}, {input_dim[ll][0]}, '
                         f'{input_dim[ll][1]}] -> ')
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                c_file.write(f'[{input_chan[ll]}, {pooled_dim[ll][0]}, {pooled_dim[ll][1]}] -> ')

            # Add element-wise dimension
            data = np.expand_dims(data, 0)

            in_chan = input_chan[ll]

            # Allow 1D <-> 2D and 2D W/L conversions
            if operator[ll] == op.CONV1D:
                assert input_dim[ll][1] == 1
                data = data.reshape(data.shape[0], data.shape[1], input_dim[ll][0])
            else:
                data = data.reshape(data.shape[0], data.shape[1],
                                    input_dim[ll][0], input_dim[ll][1])

            data, out_size = pooling_layer(
                ll,
                verbose,
                False,
                data[0].shape,
                pool[ll],
                pool_stride[ll],
                pool_average[ll],
                data,
                expand=1,
                expand_thresh=16384,
                operation=operator[ll],
                operands=data.shape[0],
                rounding=False,
                debug=debug,
            )

            if operator[ll] == op.CONV1D:
                assert out_size[0] == in_chan \
                    and out_size[1] == pooled_dim[ll][0] \
                    and pooled_dim[ll][1] == 1
            else:
                assert out_size[0] == in_chan \
                    and out_size[1] == pooled_dim[ll][0] \
                    and out_size[2] == pooled_dim[ll][1]

            # Get rid of element-wise dimension
            data = np.squeeze(data, axis=0)

            if operator[ll] == op.CONV2D:
                if flatten[ll]:
                    in_chan *= input_dim[ll][0] * input_dim[ll][1]
                    data = data.reshape(in_chan, 1, 1)

                out_buf, out_size = conv2d_layer(
                    ll,
                    verbose,
                    False,
                    data.shape,
                    kernel_size[ll],
                    output_shift[ll],
                    output_chan[ll],
                    padding[ll],
                    dilation[ll],
                    stride[ll],
                    activate[ll],
                    kernel[ll].reshape(
                        output_chan[ll],
                        in_chan,
                        kernel_size[ll][0],
                        kernel_size[ll][1],
                    ),
                    bias[ll],
                    data,
                    device=device,
                    debug=debug,
                )
            else:
                out_buf, out_size = conv1d_layer(
                    ll,
                    verbose,
                    False,
                    data.shape,
                    kernel_size[ll][0],
                    output_shift[ll],
                    output_chan[ll],
                    padding[ll][0],
                    dilation[ll][0],
                    stride[ll][0],
                    activate[ll],
                    kernel[ll].reshape(
                        output_chan[ll],
                        in_chan,
                        kernel_size[ll][0]
                    ),
                    bias[ll],
                    data,
                    device=device,
                    debug=debug,
                )
            c_file.write(f'{out_size}\n')

            source = 'input_data' if ll == 0 else buffer0

            if pool[ll][0] > 1 or pool[ll][1] > 1:
                if ll == 0:
                    c_file.write('  memcpy(buffer0, input, input_size);'
                                 ' // Pooling may destroy input\n')
                pool_type = 'ave' if pool_average[ll] else 'max'
                if pool[ll][0] != pool[ll][1]:
                    c_file.write(f'  arm_{pool_type}pool_nonsquare_q7_HWC_nonsquare({buffer0}, '
                                 f'{input_dim[ll][1]}, {input_dim[ll][0]}, '
                                 f'{input_chan[ll]}, {pool[ll][1]}, {pool[ll][0]}, 0, 0, '
                                 f'{pool_stride[ll][1]}, {pool_stride[ll][0]}, '
                                 f'{pooled_dim[ll][1]}, {pooled_dim[ll][0]}, '
                                 f'(q7_t *) col_buffer, {buffer1});\n')
                else:
                    if input_dim[ll][0] == input_dim[ll][1]:
                        c_file.write(f'  arm_{pool_type}pool_q7_HWC({buffer0}, '
                                     f'{input_dim[ll][0]}, {input_chan[ll]}, '
                                     f'{pool[ll][0]}, 0, {pool_stride[ll][0]}, '
                                     f'{pooled_dim[ll][0]}, (q7_t *) col_buffer, {buffer1});\n')
                    else:
                        c_file.write(f'  arm_{pool_type}pool_q7_HWC_nonsquare({buffer0}, '
                                     f'{input_dim[ll][1]}, {input_dim[ll][0]}, '
                                     f'{input_chan[ll]}, {pool[ll][0]}, 0, {pool_stride[ll][0]}, '
                                     f'{pooled_dim[ll][1]}, {pooled_dim[ll][0]}, '
                                     f'(q7_t *) col_buffer, {buffer1});\n')
                source = buffer1
                buffer0, buffer1 = buffer1, buffer0

            # Check for squareness
            if kernel_size[ll][0] == kernel_size[ll][1] \
               and pooled_dim[ll][0] == pooled_dim[ll][1] \
               and output_dim[ll][0] == output_dim[ll][1] \
               and padding[ll][0] == padding[ll][1] \
               and stride[ll][0] == stride[ll][1]:
                fn = 'fast' if input_chan[ll] % 4 == 0 and output_chan[ll] % 2 == 0 else 'basic'
                c_file.write(f'  arm_convolve_HWC_q7_{fn}({source}, '
                             f'{pooled_dim[ll][0]}, '
                             f'{input_chan[ll]}, weights_{ll}, {output_chan[ll]}, '
                             f'{kernel_size[ll][0]}, '
                             f'{padding[ll][0]}, '
                             f'{stride[ll][0]}, '
                             f'bias_{ll}, 0, 7, {buffer1}, '
                             f'{output_dim[ll][0]}, '
                             'col_buffer, NULL);\n')
            else:
                c_file.write(f'  arm_convolve_HWC_q7_basic_nonsquare({source}, '
                             f'{pooled_dim[ll][1]}, {pooled_dim[ll][0]}, '
                             f'{input_chan[ll]}, weights_{ll}, {output_chan[ll]}, '
                             f'{kernel_size[ll][1]}, {kernel_size[ll][0]}, '
                             f'{padding[ll][1]}, {padding[ll][0]}, '
                             f'{stride[ll][1]}, {stride[ll][0]},\n'
                             '                                      '
                             f'bias_{ll}, 0, 7, {buffer1}, '
                             f'{output_dim[ll][1]}, {output_dim[ll][0]}, '
                             'col_buffer, NULL);\n')
            assert out_size[0] == output_chan[ll] \
                and out_size[1] == output_dim[ll][0] and out_size[2] == output_dim[ll][1]

            if activate[ll]:
                size = output_dim[ll][0] * output_dim[ll][1] * output_chan[ll]
                if size < 65536:
                    c_file.write(f'  arm_relu_q7({buffer1}, {size});\n')
                else:
                    c_file.write(f'  arm_relu32_q7({buffer1}, {size});\n')
            buffer0, buffer1 = buffer1, buffer0

            data = out_buf.reshape(out_size)
            c_file.write('\n')
            data_cmsis = data.transpose((1, 2, 0)).flatten()
            if verbose:
                print('TRANSPOSED (HWC) AND FLATTENED:')
                print(data_cmsis)
                print('')

        c_file.write(f'  *output = {buffer0};\n'
                     f'  *output_size = {data_cmsis.size};\n\n'
                     '  return 1;\n}\n\n')

        if fc_weights:
            data = data.flatten()

            out_buf, out_size = linear_layer(
                verbose=verbose,
                verbose_data=False,
                activation=False,
                weight=fc_weights[0],
                bias=fc_bias[0],
                data=data,
                debug=debug
            )

            # Rearrange the weights to account for the shape of the conv layer output
            w = fc_weights[0]. \
                reshape((fc_weights[0].shape[0], output_chan[ll],
                         output_dim[ll][0], output_dim[ll][1])). \
                transpose(0, 2, 3, 1). \
                reshape((fc_weights[0].shape[0], fc_weights[0].shape[1]))

            # np.dot(worg, torg.flatten()) should be equal to np.dot(wnew, tnew.flatten())
            assert (np.dot(fc_weights[0], data) == np.dot(w, data_cmsis)).all()

            toplevel.fc_layer(c_file, weight_header, w, fc_bias[0], cmsis_nn=True)

        c_file.write('int main(void)\n{\n'
                     '  int i;\n'
                     '  q7_t *output;\n'
                     '  int output_size;\n\n'
                     f'  cnn_run(input_data, {input_size}, &output, &output_size);\n\n')

        toplevel.c_define(sampledata_header, data_cmsis, 'OUTPUT_DATA', '%d', 16)
        c_file.write('  if (memcmp(output_data, output, output_size) == 0)\n'
                     '    printf("*** PASS ***\\n\\n");\n'
                     '  else\n'
                     '    printf("!!! FAIL !!!\\n\\n");\n\n')

        if fc_weights:
            c_file.write('  fc_layer(output);\n\n')
            c_file.write('  printf("Classification results:\\n");\n'
                         '  for (i = 0; i < NUM_CLASSES; i++) {\n'
                         '    printf("[%6d] -> Class %d: %0.1f%%\\n", fc_output[i], i, '
                         '(double) (100.0 * ml_softmax[i] / 32768.0));\n'
                         '  }\n\n')
        else:
            c_file.write('  printf("Output of final layer:\\n");\n'
                         '  for (i = 0; i < output_size; i++) {\n'
                         '    printf("%5hhd", (int8_t) (output[i] & 0xff));\n'
                         '    if ((i + 1) % 32 == 0)\n      printf("\\n");\n'
                         '    else if ((i + 1) % 4 == 0)\n      printf(" ");\n'
                         '  }\n'
                         '  printf("\\n");\n'
                         '\n')

        c_file.write('  return 0;\n}\n\n')

    # Close header files
    sampledata_header.close()
    weight_header.close()
Exemple #23
0
def parse(config_file, max_conv=None, device=84):  # pylint: disable=unused-argument
    """
    Configure network parameters from the YAML configuration file `config_file`.
    `max_conv` can be set to force an early termination of the parser.
    `device` is `84`, `85`, etc.
    The function returns the YAML dictionary, the length of the processor map,
    and a settings dictionary.
    """
    def error_exit(message, sequence):
        """
        Print error message `message` for layer sequence `sequence` and exit.
        """
        eprint(
            f'{message} (found in layer sequence {sequence} in YAML configuration).'
        )
        sys.exit(1)

    # Load configuration file
    with open(config_file) as cfg_file:
        print(f'Reading {config_file} to configure network...')
        cfg = yaml.load(cfg_file, Loader=UniqueKeyLoader)

    if bool(
            set(cfg) -
            set(['bias', 'dataset', 'layers', 'output_map', 'arch', 'weights'])
    ):
        eprint(f'Configuration file {config_file} contains unknown key(s).')
        sys.exit(1)

    if 'layers' not in cfg or 'arch' not in cfg or 'dataset' not in cfg:
        eprint(f'Configuration file {config_file} does not contain '
               f'`layers`, `arch`, or `dataset`.')
        sys.exit(1)

    # These are initialized with 'None'. Use this to see whether a layer was configured;
    # unconfigured values are auto-initialized to the previous layer's value or a default.
    processor_map = [None] * tc.dev.MAX_LAYERS
    output_map = [None] * tc.dev.MAX_LAYERS
    input_offset = [None] * tc.dev.MAX_LAYERS
    input_chan = [None] * tc.dev.MAX_LAYERS
    input_dim = [None] * tc.dev.MAX_LAYERS
    output_chan = [None] * tc.dev.MAX_LAYERS
    # All other variables are initialized with the default values
    padding = [[1, 1]] * tc.dev.MAX_LAYERS
    pool = [[1, 1]] * tc.dev.MAX_LAYERS
    pooling_enabled = [False] * tc.dev.MAX_LAYERS
    average = [0] * tc.dev.MAX_LAYERS
    pool_stride = [[1, 1]] * tc.dev.MAX_LAYERS
    quantization = [None] * tc.dev.MAX_LAYERS
    bias_quantization = [8] * tc.dev.MAX_LAYERS
    output_shift = [None] * tc.dev.MAX_LAYERS
    output_offset = [0] * tc.dev.MAX_LAYERS
    activation = [None] * tc.dev.MAX_LAYERS
    big_data = [False] * tc.dev.MAX_LAYERS
    output_width = [8] * tc.dev.MAX_LAYERS
    operator = [op.CONV2D] * tc.dev.MAX_LAYERS
    # We don't support changing the following (yet), but leave as parameters:
    dilation = [[1, 1]] * tc.dev.MAX_LAYERS
    kernel_size = [DEFAULT_2D_KERNEL] * tc.dev.MAX_LAYERS
    conv_groups = [1] * tc.dev.MAX_LAYERS
    stride = [[1, 1]] * tc.dev.MAX_LAYERS
    streaming = [False] * tc.dev.MAX_LAYERS
    flatten = [False] * tc.dev.MAX_LAYERS
    operands = [1] * tc.dev.MAX_LAYERS
    eltwise = [op.NONE] * tc.dev.MAX_LAYERS
    pool_first = [True] * tc.dev.MAX_LAYERS
    in_sequences = [None] * tc.dev.MAX_LAYERS
    write_gap = [0] * tc.dev.MAX_LAYERS

    sequence = 0
    for ll in cfg['layers']:
        if bool(
                set(ll) - set([
                    'max_pool', 'avg_pool', 'convolution', 'conv_groups',
                    'in_channels', 'in_dim', 'in_sequences', 'in_offset',
                    'kernel_size', 'pool_stride', 'out_channels', 'out_offset',
                    'activate', 'activation', 'data_format', 'eltwise',
                    'flatten', 'op', 'operands', 'operation', 'operator',
                    'output_processors', 'output_width', 'output_shift',
                    'pool_first', 'processors', 'pad', 'quantization',
                    'sequence', 'streaming', 'stride', 'write_gap'
                ])):
            eprint(
                f'Configuration file {config_file} contains unknown key(s) for `layers`.'
            )
            sys.exit(1)

        if 'sequence' in ll:
            sequence = ll['sequence']  # Override sequence information

        if processor_map[sequence]:
            error_exit('Layer was already specified', sequence)
        if 'processors' in ll:
            processor_map[sequence] = ll['processors']
        if not processor_map[sequence]:
            error_exit('`processors` must not be zero or missing', sequence)
        if not isinstance(processor_map[sequence], int) \
           or processor_map[sequence] >= 2**tc.dev.MAX_PROC:
            error_exit(
                f'`processors` must be an int from 0 to 2**{tc.dev.MAX_PROC}-1',
                sequence)

        if 'output_processors' in ll:
            output_map[sequence] = ll['output_processors']
            if not output_map[sequence]:
                error_exit('`output_processors` cannot be zero', sequence)
            if not isinstance(output_map[sequence], int) \
               or output_map[sequence] >= 2**tc.dev.MAX_PROC:
                error_exit(
                    '`output_processors` must be an int from 0 to '
                    f'2**{tc.dev.MAX_PROC}-1', sequence)

        if 'max_pool' in ll:
            val = ll['max_pool']
            if not isinstance(val, list):
                pool[sequence] = [val, val]
            else:
                pool[sequence] = val
            pooling_enabled[sequence] = True
        elif 'avg_pool' in ll:
            val = ll['avg_pool']
            if not isinstance(val, list):
                pool[sequence] = [val, val]
            else:
                pool[sequence] = val
            pooling_enabled[sequence] = True
            average[sequence] = 1

        if 'pool_stride' in ll:
            val = ll['pool_stride']
            if not isinstance(val, list):
                pool_stride[sequence] = [val, val]
            else:
                pool_stride[sequence] = val

        if 'quantization' in ll:
            val = ll['quantization']
            if val not in [1, 2, 4, 8]:
                error_exit('`quantization` must be 1, 2, 4, or 8', sequence)
            quantization[sequence] = val

        if 'output_shift' in ll:
            val = ll['output_shift']
            output_shift[sequence] = val
            # The implicit shift for quantization is added later

        if 'in_channels' in ll:
            input_chan[sequence] = ll['in_channels']
        if 'in_dim' in ll:
            if isinstance(ll['in_dim'], list) and len(ll['in_dim']) > 2:
                error_exit('`in_dim` must not exceed two dimensions', sequence)
            input_dim[sequence] = ll['in_dim']
        if 'in_offset' in ll:
            input_offset[sequence] = ll['in_offset']
        if 'out_channels' in ll:
            output_chan[sequence] = ll['out_channels']
        if 'out_offset' in ll:
            output_offset[sequence] = ll['out_offset']
        else:
            print('WARNING: Defaulting to `out_offset = 0` for '
                  f'layer sequence {sequence} in YAML configuration.')

        if 'activate' in ll or 'activation' in ll:
            key = 'activate' if 'activate' in ll else 'activation'
            if ll[key].lower() == 'relu':
                activation[sequence] = op.ACT_RELU
            elif ll[key].lower() == 'abs':
                activation[sequence] = op.ACT_ABS
            elif ll[key].lower() == 'none':
                activation[sequence] = None
            else:
                error_exit(f'Unknown value "{ll[key]}" for `{key}`', sequence)

        if 'convolution' in ll or 'operation' in ll or 'op' in ll or 'operator' in ll:
            key = 'convolution' if 'convolution' in ll else \
                  'operation' if 'operation' in ll else \
                  'operator' if 'operator' in ll else \
                  'op'
            conv = ll[key].lower()
            if conv == 'conv1d':
                operator[sequence] = op.CONV1D
            elif conv == 'conv2d':
                operator[sequence] = op.CONV2D
            elif conv == 'convtranspose2d':
                operator[sequence] = op.CONVTRANSPOSE2D
            elif conv in ['none', 'passthrough']:
                operator[sequence] = op.NONE
                padding[sequence] = [0, 0]
            elif conv == 'add':
                operator[sequence] = op.NONE
                eltwise[sequence] = op.ELTWISE_ADD
                operands[sequence] = 2
                padding[sequence] = [0, 0]
            elif conv == 'or':
                operator[sequence] = op.NONE
                eltwise[sequence] = op.ELTWISE_OR
                operands[sequence] = 2
                padding[sequence] = [0, 0]
            elif conv == 'sub':
                operator[sequence] = op.NONE
                eltwise[sequence] = op.ELTWISE_SUB
                operands[sequence] = 2
                padding[sequence] = [0, 0]
            elif conv == 'xor':
                operator[sequence] = op.NONE
                eltwise[sequence] = op.ELTWISE_XOR
                operands[sequence] = 2
                padding[sequence] = [0, 0]
            elif conv in ['linear', 'fc', 'mlp']:
                # Emulate using Conv2D with 1x1 kernels and 1x1 data
                operator[sequence] = op.CONV2D
                kernel_size[sequence] = FC_KERNEL
                padding[sequence] = [0, 0]
            else:
                error_exit(f'Unknown value "{ll[key]}" for `{key}`', sequence)
        else:
            print('WARNING: Defaulting to `op: Conv2d` for '
                  f'layer sequence {sequence} in YAML configuration.')

        if 'pad' in ll:
            val = ll['pad']
            if val < 0:
                error_exit(f'Unsupported value {val} for `pad`', sequence)
            padding[sequence] = [val, val]

        if 'eltwise' in ll:
            conv = ll['eltwise'].lower()
            if conv == 'add':
                eltwise[sequence] = op.ELTWISE_ADD
                operands[sequence] = 2
            elif conv == 'or':
                eltwise[sequence] = op.ELTWISE_OR
                operands[sequence] = 2
            elif conv == 'sub':
                eltwise[sequence] = op.ELTWISE_SUB
                operands[sequence] = 2
            elif conv == 'xor':
                eltwise[sequence] = op.ELTWISE_XOR
                operands[sequence] = 2
            else:
                error_exit(f'Unknown value "{ll["eltwise"]}" for `eltwise`',
                           sequence)

        if 'pool_first' in ll:
            val = ll['pool_first']
            try:
                pool_first[sequence] = bool(val)
            except ValueError:
                error_exit(f'Unsupported value `{val}` for `pool_first`',
                           sequence)

        if 'operands' in ll:
            if not op.eltwise(eltwise[sequence]):
                error_exit(
                    '`operands` can only be used with element-wise operations',
                    sequence)
            val = ll['operands']
            if val < 2 or val > 16:
                error_exit('`operands` has to be 2..16', sequence)
            operands[sequence] = val

        if 'data_format' in ll:
            if sequence:
                error_exit(
                    '`data_format` can only be configured for the first layer',
                    sequence)

            val = ll['data_format'].lower()
            if val in ['chw', 'big']:
                big_data[sequence] = True
            elif val in ['hwc', 'little']:
                pass
            else:
                error_exit('Unknown value for `data_format`', sequence)

        if 'output_width' in ll:
            val = ll['output_width']
            if val not in [8, 32]:
                error_exit('`output_width` must be 8 or 32', sequence)
            output_width[sequence] = val

        if 'kernel_size' in ll:
            if kernel_size[sequence] != DEFAULT_2D_KERNEL:
                error_exit(
                    'Cannot configure `kernel_size` for fully connected layers',
                    sequence)

            val = str(ll['kernel_size']).lower()
            if operator[sequence] == op.CONV2D:
                if device == 84 and val not in ['3x3'] \
                        or device != 84 and val not in ['1x1', '3x3']:
                    error_exit(f'Unsupported value `{val}` for `kernel_size`',
                               sequence)
                kernel_size[sequence] = [int(val[0]), int(val[2])]
            elif operator[sequence] == op.CONVTRANSPOSE2D:
                if val not in ['3x3']:
                    error_exit(f'Unsupported value `{val}` for `kernel_size`',
                               sequence)
                kernel_size[sequence] = [int(val[0]), int(val[2])]
            else:
                try:
                    val = int(val)
                except ValueError:
                    error_exit(f'Unsupported value `{val}` for `kernel_size`',
                               sequence)
                if device == 84 and val != 9 or val < 1 or val > 9:
                    error_exit(f'Unsupported value `{val}` for `kernel_size`',
                               sequence)
                kernel_size[sequence] = [val, 1]
        elif operator[sequence] == op.CONV1D:  # Set default for 1D convolution
            kernel_size[sequence] = DEFAULT_1D_KERNEL

        if 'stride' in ll:
            val = ll['stride']
            if pooling_enabled[sequence]:
                # Must use the default stride when pooling, otherwise stride can be set
                if operator[sequence] == op.CONV2D and val != 1 \
                   or (device == 84 and val != 3 or val != 1):
                    error_exit(
                        'Cannot set `stride` to non-default value when pooling',
                        sequence)
            else:
                if operator[sequence] == op.CONVTRANSPOSE2D and val != 2:
                    error_exit(
                        'Cannot set `stride` to non-default value for ConvTranspose2D',
                        sequence)
                # Stride can be set
                stride[sequence] = [val, val]

        if 'streaming' in ll:
            val = ll['streaming']
            try:
                streaming[sequence] = bool(val)
            except ValueError:
                error_exit(f'Unsupported value `{val}` for `streaming`',
                           sequence)

        if 'flatten' in ll:
            val = ll['flatten']
            try:
                flatten[sequence] = bool(val)
            except ValueError:
                error_exit(f'Unsupported value `{val}` for `flatten`',
                           sequence)

        if 'in_sequences' in ll:
            if isinstance(ll['in_sequences'], list):
                if any([(i >= sequence) for i in ll['in_sequences']]):
                    error_exit(
                        '`in_sequences` cannot be greater than layer sequence',
                        sequence)
            elif ll['in_sequences'] >= sequence:
                error_exit(
                    '`in_sequences` cannot be greater than layer sequence',
                    sequence)
            in_sequences[sequence] = ll['in_sequences']

        if 'conv_groups' in ll:
            conv_groups[sequence] = ll['conv_groups']

        if 'write_gap' in ll:
            write_gap[sequence] = ll['write_gap']

        # Fix up values for 1D convolution or no convolution
        if operator[sequence] == op.CONV1D:
            padding[sequence][1] = 0
            pool[sequence][1] = 1
            pool_stride[sequence][1] = 1
            stride[sequence][1] = 1
        elif operator[sequence] == op.NONE:
            kernel_size[sequence] = [1, 1]
        elif operator[sequence] == op.CONVTRANSPOSE2D:
            stride[sequence] = [2, 2]

        # Check for early exit
        if max_conv is not None:
            if max_conv == 0:
                if output_map[sequence] is None and (len(cfg['layers']) >
                                                     sequence + 1):
                    if 'processors' in cfg['layers'][sequence + 1]:
                        output_map[sequence] = cfg['layers'][sequence +
                                                             1]['processors']
                break
            max_conv -= 1

        sequence += 1

    # Sequence specification may have holes. Contract to the used layers.
    for ll in range(tc.dev.MAX_LAYERS - 1, -1, -1):
        if processor_map[ll] is None:
            del processor_map[ll]
            del padding[ll]
            del pool[ll]
            del pool_stride[ll]
            del input_chan[ll]
            del input_dim[ll]
            del input_offset[ll]
            del output_chan[ll]
            del output_offset[ll]
            del average[ll]
            del activation[ll]
            del big_data[ll]
            del quantization[ll]
            del bias_quantization[ll]
            del output_shift[ll]
            del output_map[ll]
            del output_width[ll]
            del operator[ll]
            del dilation[ll]
            del kernel_size[ll]
            del stride[ll]
            del pooling_enabled[ll]
            del streaming[ll]
            del flatten[ll]
            del operands[ll]
            del eltwise[ll]
            del conv_groups[ll]
            del write_gap[ll]

    # Check all but last layer
    for ll in range(len(output_map) - 1):
        if output_width[ll] != 8:
            error_exit('`output_width` is not 8 for intermediate layer', ll)
        # Fix up default output maps
        if output_map[ll] is None:
            output_map[ll] = processor_map[ll + 1]

    # Check all but first layer
    for ll in range(1, len(input_offset)):
        # Fix up default input maps
        if input_offset[ll] is None:
            input_offset[ll] = output_offset[ll - 1]
        # Check we don't turn on streaming too late
        if streaming[ll] and not streaming[ll - 1]:
            error_exit('Enable streaming from the first layer on', ll)
    # Check first layer
    if input_offset[0] is None:
        input_offset[0] = 0
    # Check last layer
    if output_map[-1] is None and 'output_map' in cfg:
        output_map[-1] = cfg['output_map']
    if output_width[-1] != 8 and activation[-1] is not None:
        error_exit('`output_width` must be 8 when activation is used',
                   len(activation))

    # Check all layers
    for ll, e in enumerate(operator):
        # Check that pass-through does not use activation
        if e == op.NONE:
            if activation[ll] is not None:
                error_exit('Pass-through layers must not use activation', ll)
            if padding[ll][0] != 0 or padding[ll][1] != 0:
                error_exit('Padding must be zero for passthrough layers', ll)
        # Check that pooling isn't set for ConvTranspose2d:
        elif e == op.CONVTRANSPOSE2D:
            if pooling_enabled[ll]:
                error_exit('ConvTranspose2d cannot be used with pooling', ll)
        # Check that element-wise does not use Conv1d
        if e == op.CONV1D and operands[ll] > 1:
            error_exit(
                'Element-wise operations cannot be combined with Conv1d', ll)
        if not pool_first[ll] and (operands[ll] == 1
                                   or pool[ll][0] == 1 and pool[ll][1] == 1):
            error_exit(
                '`pool_first: False` requires both pooling and element-wise operations',
                ll)

    if device == 84:
        # Fix up defaults for Conv1D:
        for ll, e in enumerate(operator):
            if e == op.CONV1D:
                kernel_size[ll] = [9, 1]

    settings = {}
    settings['padding'] = padding
    settings['pool'] = pool
    settings['pooling_enabled'] = pooling_enabled
    settings['pool_stride'] = pool_stride
    settings['input_chan'] = input_chan
    settings['input_dim'] = input_dim
    settings['input_offset'] = input_offset
    settings['output_chan'] = output_chan
    settings['output_offset'] = output_offset
    settings['processor_map'] = processor_map
    settings['average'] = average
    settings['activation'] = activation
    settings['big_data'] = big_data
    settings['quantization'] = quantization
    settings['bias_quantization'] = bias_quantization
    settings['output_shift'] = output_shift
    settings['output_processor_map'] = output_map
    settings['output_width'] = output_width
    settings['operator'] = operator
    settings['dilation'] = dilation
    settings['kernel_size'] = kernel_size
    settings['stride'] = stride
    settings['streaming'] = streaming
    settings['flatten'] = flatten
    settings['operands'] = operands
    settings['eltwise'] = eltwise
    settings['pool_first'] = pool_first
    settings['in_sequences'] = in_sequences
    settings['conv_groups'] = conv_groups
    settings['write_gap'] = write_gap

    return cfg, len(processor_map), settings
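A hedged usage sketch for parse() above; the file name and device are made up, and the settings keys are those assembled at the end of the function:

# Hypothetical invocation; parse() exits via eprint()/sys.exit() on malformed YAML.
cfg, layer_count, params = parse('networks/example-net.yaml', device=85)
print(f'{layer_count} layers configured')
print('per-layer padding:', params['padding'])
print('per-layer operators:', params['operator'])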
Exemple #24
0
def load(
    checkpoint_file,
    unused_arch,
    fc_layer,
    quantization,
    bias_quantization,
    output_shift,
    kernel_size,  # this information available in onnx model
    operator,
    verbose=False,
    no_bias=None,
):
    """
    Load weights and biases from `checkpoint_file`. If `arch` is not None and does not match
    the architecture in the checkpoint file, abort with an error message. If `fc_layer` is
    `True`, configure a single fully connected classification layer for software rather than
    hardware.
    `quantization` is a list of expected bit widths for the layer weights (always 8 for AI84).
    This value is checked against the weight inputs.
    `bias_quantization` is a list of the expected bit widths for the layer biases (always
    8 for AI84/AI85).
    In addition to returning weights and biases, this function configures the network output
    channels and the number of layers.
    When `verbose` is set, display the shapes of the weights.
    """
    model = onnx.load(checkpoint_file)
    print(f'Reading {checkpoint_file} to configure network weights...')

    layers = 0
    num_conv_layers = len(quantization)
    no_bias = no_bias or []
    weights = []
    bias = []
    fc_weights = []
    fc_bias = []
    weight_keys = []
    bias_keys = []
    output_channels = []
    input_channels = []
    param_count = 0
    param_size = 0
    error_exit = False
    quant = []
    bias_quant = []
    weight_min = []
    weight_max = []
    weight_size = []
    bias_min = []
    bias_max = []
    bias_size = []
    seq = 0

    kernel_size_onnx = []

    initializers = {t.name for t in model.graph.initializer}
    for _, node in enumerate(model.graph.node):

        if node.op_type == 'Conv' or node.op_type == 'Gemm':
            _inputs, _outputs = get_inouts(node)
            for _input in _inputs:
                w = process_channels(model, _input, initializers)
                if w is not None:
                    if node.op_type == 'Gemm':  # general matrix multiplication (FC layer)
                        kernel_shape = [1, 1]
                        kernel_size_onnx.append(kernel_shape)
                        if layers >= num_conv_layers:
                            continue
                        if fc_layer:
                            if _input == _inputs[1]:  # weight
                                assert w.min() >= -128 and w.max() <= 127
                                fc_weights.append(w)

                            if len(_inputs) == 3:  # have optional bias input
                                if _input == _inputs[2]:  # bias
                                    assert w.min() >= -128 and w.max() <= 127
                                    fc_bias.append(w)
                            elif _input == _inputs[1]:  # add bias 'None'
                                fc_bias.append(
                                    None)  # during weight input processing

                    if node.op_type == 'Conv':  # (Conv layer)
                        for a in node.attribute:
                            if a.name == 'kernel_shape':
                                kernel_size_onnx.append(a.ints)

                    if len(w.shape) > 1:  # not a bias
                        quant.append(quantization[seq])

                        w_min, w_max = w.min(), w.max()

                        # Determine quantization or make sure that what was given fits
                        if quantization[seq] is not None:
                            assert w_min >= -(2**(quantization[seq] -
                                                  1)), print(w_min)
                            assert w_max < 2**(quantization[seq] -
                                               1), print(w_max)
                        else:
                            if w_max > 0:
                                w_max_m = int(w_max)
                            else:
                                w_max_m = int(abs(w_max)) - 1
                            if w_min > 0:
                                w_min_m = int(w_min)
                            else:
                                w_min_m = int(abs(w_min)) - 1
                            quantization[seq] = 1 << (
                                fls(max(fls(w_max_m), fls(w_min_m)) + 1) + 1)
                            assert quantization[seq] <= 8

                        weight_min.append(w_min)
                        weight_max.append(w_max)

                        # Not overriding output_shift?
                        if output_shift[seq] is None:
                            output_shift[seq] = 0
                        # Add based on quantization
                        output_shift[seq] += 8 - quantization[seq]

                        # TODO: Double check if we need to check conv2d if opn is known
                        # to be opn.CONVTRANSPOSE2D. We should be able to get this
                        # from the op_type Conv plus shape?
                        if operator[seq] == opn.CONVTRANSPOSE2D:
                            # For ConvTranspose2d, flip the weights as follows:
                            w = np.flip(w, axis=(2, 3)).swapaxes(0, 1)

                        input_channels.append(w.shape[1])  # Input channels
                        output_channels.append(w.shape[0])  # Output channels

                        if len(w.shape) == 2:  # MLP
                            if kernel_size_onnx[seq][
                                    0] != 1 or kernel_size_onnx[seq][1] != 1:
                                eprint(
                                    f'The `kernel_size` for the MLP layer {seq} should '
                                    f'be set to 1x1 instead of '
                                    f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.'
                                )
                                error_exit = True
                        elif len(w.shape) == 3:  # 1D
                            if kernel_size_onnx[seq][0] != w.shape[2] \
                               or kernel_size_onnx[seq][1] != 1:
                                eprint(
                                    f'The `kernel_size` for the 1D layer {seq} should '
                                    f'be set to {w.shape[2]}x1 instead of '
                                    f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.'
                                )
                                error_exit = True
                        elif len(w.shape) == 4:  # 2D
                            if kernel_size_onnx[seq][0] != w.shape[2] \
                               or kernel_size_onnx[seq][1] != w.shape[3]:
                                eprint(
                                    f'The `kernel_size` for the 2D layer {seq} should '
                                    f'be set to {w.shape[2]}x{w.shape[3]} instead of '
                                    f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.'
                                )
                                error_exit = True

                        w_count = np.prod(w.shape)
                        param_count += w_count
                        w_size = (w_count * quantization[seq] + 7) // 8
                        weight_size.append(w_size)
                        param_size += w_size

                        if len(w.shape) == 2:  # linear - add dummy 'channel'
                            w = np.expand_dims(w, axis=0)
                        else:  # conv1d, conv2d, ... - combine input and output channels
                            w = np.reshape(w, (-1, ) + w.shape[2:])

                        weights.append(w)
                        weight_keys.append(_input)

                    if len(_inputs) < 3 or \
                       (_input == _inputs[2] and seq in no_bias):  # no bias input
                        bias.append(None)
                        bias_min.append(0)
                        bias_max.append(0)
                        bias_keys.append('N/A')
                        bias_quant.append(0)
                        bias_size.append(0)
                    elif _input == _inputs[2]:  # bias input
                        w = w // tornadocnn.dev.BIAS_DIV
                        w_min, w_max = w.min(), w.max()
                        assert w_min >= -(2**(bias_quantization[seq] - 1))
                        assert w_max < 2**(bias_quantization[seq] - 1)
                        bias_min.append(w_min)
                        bias_max.append(w_max)

                        bias.append(w)
                        bias_keys.append(_input)
                        bias_quant.append(bias_quantization[seq])
                        w_count = np.prod(w.shape)
                        param_count += w_count
                        w_size = (w_count * 8 + (bias_quantization[seq] -
                                                 1)) // bias_quantization[seq]
                        bias_size.append(w_size)
                        param_size += w_size

            seq += 1
            layers += 1
        # TODO: Things to add
        # if attribute.name == 'pads':
        # if attribute.name == 'strides':

    if verbose:
        print(
            'Layer  InCh OutCh  Weights         Quant  Min Max   Size '
            'Key                                 Bias       Quant  Min Max Size Key'
        )
        for ll in range(layers):
            if ll < len(weights) and weights[ll] is not None:
                weight_shape = str(weights[ll].shape)
                if bias[ll] is not None:
                    bias_shape = str(bias[ll].shape)
                else:
                    bias_shape = 'N/A'
                print(
                    f'{ll:4}: '
                    f'{input_channels[ll]:5} {output_channels[ll]:5}  '
                    f'{weight_shape:15} '
                    f'{quant[ll]:5} {weight_min[ll]:4} {weight_max[ll]:3} {weight_size[ll]:6} '
                    f'{weight_keys[ll]:35} '
                    f'{bias_shape:10} '
                    f'{bias_quant[ll]:5} {bias_min[ll]:4} {bias_max[ll]:3} {bias_size[ll]:4} '
                    f'{bias_keys[ll]:25}')
        print(
            f'TOTAL: {layers} layers, {param_count:,} parameters, {param_size:,} bytes'
        )

    if error_exit:
        sys.exit(1)

    if verbose:
        with np.printoptions(threshold=np.inf, linewidth=80):
            print("\nSUMMARY\n=======")
            print(layers, "layers\n")
            print("weights:")
            print(weights)
            print("bias:")
            print(bias)
            print("fc_weights:")
            print(fc_weights)
            print("fc_bias:")
            print(fc_bias)
            print("input_channels:")
            print(input_channels)
            print("output_channels:")
            print(output_channels)
            print("")

    return layers, weights, bias, output_shift, \
        fc_weights, fc_bias, input_channels, output_channels
Exemple #25
0
def extract_funct_alt(funct_lines, funct_name, starting_line_num):
    """
    Constructs a cloned function from an array of code lines. 
    """
    start_line_num = 0
    call_list = ["call", "callf", "callq"]
    returns = ["ret", "retf", "iret", "retq", "iretq"]
    jmp_list = [
        "jo", "jno", "jb", "jnae", "jc", "jnb", "jae", "jnc", "jz", "je",
        "jnz", "jne", "jbe", "jna", "jnbe", "ja", "js", "jns", "jp", "jpe",
        "jnp", "jpo", "jl", "jnge", "jnl", "jge", "jle", "jng", "jnle", "jg",
        "jecxz", "jrcxz", "jmp", "jmpe"
    ]
    CALL_SITE, RETURN_SITE, INDIR_JMP_SITE, PLT_SITE, = 0, 1, 2, 3

    comment_continues = False
    sites = []
    direct_call_sites = []
    empty_ret_dict = dict()
    line_num = starting_line_num

    for asm_line in funct_lines:
        asm_parsing.update_dwarf_loc(asm_line, dwarf_loc)
        try:
            first_word = asm_line.split()[0]
        except IndexError:
            # ignore empty line (the loop variable is reassigned on the next iteration)
            line_num += 1
            continue
        if first_word[:len('.LFE')] == '.LFE':
            break
        else:
            targets = []
            labels, key_symbol, arg_str, comment_continues = (
                asm_parsing.decode_line(asm_line, comment_continues))

        if key_symbol in call_list:
            new_site = funct_cfg.Site(line_num, targets, CALL_SITE, dwarf_loc)
            if '%' not in arg_str:
                new_site.targets.append(arg_str)
                direct_call_sites.append(new_site)
            sites.append(new_site)
        elif key_symbol in returns:
            # empty return dict passed so that every site's return dict is
            # a reference to the function's return dict
            sites.append(
                funct_cfg.Site(line_num, empty_ret_dict, RETURN_SITE,
                               dwarf_loc))
        elif key_symbol in jmp_list:
            if '%' in arg_str:
                sites.append(
                    funct_cfg.Site(line_num, targets, INDIR_JMP_SITE,
                                   dwarf_loc))
        line_num += 1
    else:
        eprint(
            dwarf_loc.filename() + ':' + ' ' + ':' + str(start_line_num) +
            ' error: unterminated function: ', funct_name)

    src_filename = dwarf_loc.filename()
    new_funct = funct_cfg.Function(funct_name, ' ', src_filename, sites,
                                   starting_line_num)
    new_funct.direct_call_sites = direct_call_sites
    new_funct.ret_dict = empty_ret_dict
    return new_funct, line_num
Exemple #26
0
 def filename(self):
     if self.filenum not in self._filename_dict:
         eprint('warning: undefined filenumber: ' + str(self.filenum))
         return '?'
     return self._filename_dict[self.filenum]
Exemple #27
0
def generateDicts(sock):
    severityMap = {
        "0": "emerg",
        "1": "alert",
        "2": "crit",
        "3": "err",
        "4": "warning",
        "5": "notice",
        "6": "info",
        "7": "debug"
    }

    facilityMap = {
        "0": "kernel",
        "1": "user",
        "2": "mail",
        "3": "system",
        "4": "auth",
        "5": "syslog",
        "6": "lpd",
        "7": "news",
        "8": "uucp",
        "9": "time",
        "10": "auth",
        "11": "ftp",
        "12": "ntp",
        "13": "logaudit",
        "14": "logalert",
        "15": "clock",
        "16": "local0",
        "17": "local1",
        "18": "local2",
        "19": "local3",
        "20": "local4",
        "21": "local5",
        "22": "local6",
        "23": "local7"
    }

    skip = 0
    skipcount = 0
    ssec = datetime.utcnow().strftime("%S")
    yieldcount = 0

    # Compile regex patterns for iteration on each component of the message
    pats = {}

    pristrings = [r'^<(?P<pri>\d{1,3})>(\d*:?)?']
    pats['pri'] = []
    for i in pristrings:
        pats['pri'].append(re.compile(i + r'(?P<space>\s?)\S+'))

    # Date/time
    datestrings = [
        r'(?P<date>[A-Za-z]+ [ \d]?\d \d\d:\d\d:\d\d( [A-Z]{3}:)?)',
        r'(?P<date>\d{4} [A-Za-z]+ [ \d]?\d \d\d:\d\d:\d\d( [A-Z]{3}:)?)',
        r'(?P<date>\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d\.\d{3}Z)'
    ]
    pats['date'] = []
    for i in datestrings:
        pats['date'].append(re.compile(i + r'(?P<space>\s+)\S+'))

    # Host/IP
    hoststrings = [
        r'(?P<host>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
        r'(?P<host>[a-z0-9_-]+(\.[a-z0-9_-]+)*(\.[a-z]+[0-9]?))',
        r'(?P<host>[a-z0-9_-]+)'
    ]
    pats['host'] = []
    for i in hoststrings:
        pats['host'].append(re.compile(i + r'(?P<space>\s+)\S+',
                                       re.IGNORECASE))

    # Rest of line
    pats['message'] = [re.compile(r'(?P<message>.*)(?P<space>\s*)$')]

    currentDict = {}

    a10 = Device.A10('logs')
    arista = Device.Arista('logs')
    brocade = Device.Brocade('logs')
    f5 = Device.F5('logs')
    force10 = Device.Force10('logs')
    juniper = Device.Juniper('logs')
    linux = Device.Linux('logs')

    pktbuf = []
    pktbuf_peak = 0
    packets = 0
    processed = 0

    while True:
        try:
            data, (src_ip, port) = sock.recvfrom(8192)
            pktbuf.append((data, src_ip, port))
            packets += 1
        except socket.error:
            if int(datetime.utcnow().strftime("%S")) != ssec:
                ssec = int(datetime.utcnow().strftime("%S"))
                eprint(
                    "%28s Messages in buffer (peak): %d read: %d processed: %d yielded: %d skipped: %d"
                    % (str(datetime.utcnow()), pktbuf_peak, packets, processed,
                       yieldcount, skipcount))
                pktbuf_peak = 0

            if len(pktbuf) == 0:
                continue
            else:
                buf_len = len(pktbuf)
                if buf_len > pktbuf_peak:
                    pktbuf_peak = buf_len

            while pktbuf:
                line, src_ip, port = pktbuf.pop(0)
                processed += 1

                # We can safely init the dict here because multiline messages are
                # still contained within single datagrams
                currentDict = {}

                # Pristine copy of what we received
                currentDict['raw_message'] = line

                # Strip any leading junk
                if line[0] == '\0':
                    line = line.lstrip('\r\n\0 ')

                for pname in ['pri', 'date', 'host', 'message']:
                    for p in pats[pname]:
                        matched = p.match(line)

                        if matched:
                            if pname == 'pri':
                                currentDict['severity_int'] = str(
                                    int(matched.group('pri')) & 7)
                                currentDict['facility_int'] = str(
                                    int(matched.group('pri')) >> 3 & 23)
                                currentDict['severity_label'] = severityMap[
                                    currentDict['severity_int']]
                                currentDict['facility_label'] = facilityMap[
                                    currentDict['facility_int']]

                            currentDict[pname] = matched.group(pname)

                            # Trim the line up to the ending space from the last match
                            line = line[matched.end('space'):]

                            # We matched this element so no need to keep looping on it
                            break

                    # None of the patterns matched for this field
                    if pname not in currentDict:
                        eprint("Did not match for %s: %s" % (pname, line))

                # Chop off any remaining crap
                line = line.rstrip()

                # Finished parsing but did not consume the whole line (should never happen)
                if len(line) > 0:
                    eprint("still some line left: [%s]" % line)

                # Did not match anything at all?
                if currentDict == {}:
                    eprint("matched nothing: [%s]" % line)
                    skipcount += 1
                    continue

                else:
                    skip = 0
                    vendor = None
                    #currentDict['fromhost'] = resolveHostname(src_ip)
                    currentDict['fromhost'] = src_ip
                    currentDict['fromhost-ip'] = src_ip
                    if 'host' not in currentDict:
                        currentDict['host'] = currentDict['fromhost'].lower()
                    else:
                        currentDict['host'] = currentDict['host'].lower()

                    try:
                        host = currentDict['host']
                        if host.find('v-') >= 0 and host.find('-net') >= 7:
                            vendor = linux

                        elif host.startswith(('bar', 'bcr', 'scr', 'sff', 'mfw',
                                              're', 'bmr', 'fw', 'r1', 'r2')):
                            vendor = juniper

                        elif host.startswith(('ma', 'trr', 'spr', 'ssr', 'ser')):
                            vendor = arista

                        elif host.startswith(('slb', 'mlb', 'glb', 'vpr')):
                            vendor = a10

                        elif host.startswith('lb'):
                            vendor = f5

                        elif host.startswith('sw'):
                            vendor = brocade

                        elif host.startswith('10.1'):
                            vendor = force10

                        if vendor:
                            currentDict['vendor'] = vendor.vendor
                            if not vendor.matchLogPattern(currentDict):
                                eprint(
                                    "Did not match %s message for host %s: %s"
                                    % (vendor.vendor, currentDict['host'],
                                       currentDict['message']))
                                # Flag as unmatched message
                                currentDict['state'] = 5

                        else:
                            eprint(
                                "Did not match host pattern for host: %s  message: %s"
                                %
                                (currentDict['host'], currentDict['message']))

                    except KeyError:
                        eprint("Field not found:", currentDict)

                        skip = 1
                        skipcount += 1

                    if skip == 0:
                        yield (currentDict)
                        yieldcount += 1
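The 'pri' branch above splits the syslog priority value into severity (low three bits) and facility (remaining bits) before mapping them through severityMap/facilityMap. A standalone sketch of that decomposition (note that the snippet additionally masks the facility with & 23, which alters facilities 8 through 15):

pri = 190            # e.g. parsed from a leading '<190>' in the datagram
severity = pri & 7   # 6  -> 'info'
facility = pri >> 3  # 23 -> 'local7'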
Exemple #28
0
def load(
    checkpoint_file,
    arch,
    fc_layer,
    quantization,
    bias_quantization,
    output_shift,
    kernel_size,
    operator,
    verbose=False,
    no_bias=None,
    conv_groups=None,
):
    """
    Load weights and biases from `checkpoint_file`. If `arch` is not None and does not match
    the architecture in the checkpoint file, abort with an error message. If `fc_layer` is
    `True`, configure a single fully connected classification layer for software rather than
    hardware.
    `quantization` is a list of expected bit widths for the layer weights (always 8 for AI84).
    This value is checked against the weight inputs.
    `bias_quantization` is a list of the expected bit widths for the layer biases (always
    8 for AI84/AI85).
    In addition to returning weights and biases, this function configures the network output
    channels and the number of layers.
    When `verbose` is set, display the shapes of the weights.
    """
    no_bias = no_bias or []
    weights = []
    bias = []
    fc_weights = []
    fc_bias = []
    weight_keys = []
    bias_keys = []
    quant = []
    bias_quant = []
    weight_min = []
    weight_max = []
    weight_size = []
    bias_min = []
    bias_max = []
    bias_size = []

    checkpoint = torch.load(checkpoint_file, map_location='cpu')
    print(f'Reading {checkpoint_file} to configure network weights...')

    if 'state_dict' not in checkpoint or 'arch' not in checkpoint:
        raise RuntimeError("\nNo `state_dict` or `arch` in checkpoint file.")

    if arch and checkpoint['arch'].lower() != arch.lower():
        eprint(
            f"Network architecture of configuration file ({arch}) does not match "
            f"network architecture of checkpoint file ({checkpoint['arch']}).")
        sys.exit(1)

    checkpoint_state = checkpoint['state_dict']
    layers = 0
    num_conv_layers = len(quantization)
    have_fc_layer = False
    output_channels = []
    input_channels = []
    param_count = 0
    param_size = 0
    error_exit = False
    seq = 0

    for _, k in enumerate(checkpoint_state.keys()):
        # Skip over non-weight layers
        while seq < len(operator) and operator[seq] == opn.NONE:
            seq += 1

        operation, parameter = k.rsplit(sep='.', maxsplit=1)
        if parameter in ['weight']:
            module, op = k.split(sep='.', maxsplit=1)
            op = op.rsplit(sep='.', maxsplit=1)[0]
            if module != 'fc' or module == 'fc' and not fc_layer:
                if layers >= num_conv_layers or seq >= num_conv_layers:
                    continue

                w = checkpoint_state[k].numpy().astype(np.int64)
                w_min, w_max = w.min(), w.max()

                # Determine quantization or make sure that what was given fits
                if quantization[seq] is not None:
                    assert w_min >= -(2**(quantization[seq] - 1))
                    assert w_max < 2**(quantization[seq] - 1)
                else:
                    if w_max > 0:
                        w_max_m = int(w_max)
                    else:
                        w_max_m = int(abs(w_max)) - 1
                    if w_min > 0:
                        w_min_m = int(w_min)
                    else:
                        w_min_m = int(abs(w_min)) - 1
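                    # Infer the smallest bit width that can hold the weight range.
                    # Assuming fls() returns the 0-based index of the highest set
                    # bit: weights in [-8, 7] give w_max_m = w_min_m = 7, fls(7) = 2,
                    # fls(2 + 1) = 1, and 1 << (1 + 1) = 4, i.e. 4-bit quantization.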
                    quantization[seq] = 1 << (
                        fls(max(fls(w_max_m), fls(w_min_m)) + 1) + 1)
                    assert quantization[seq] <= 8
                quant.append(quantization[seq])

                weight_min.append(w_min)
                weight_max.append(w_max)

                if op == 'conv2d' and operator[seq] == opn.CONVTRANSPOSE2D:
                    # For ConvTranspose2d, flip the weights as follows:
                    w = np.flip(w, axis=(2, 3)).swapaxes(0, 1)

                mult = conv_groups[
                    seq] if operator[seq] != opn.CONVTRANSPOSE2D else 1
                input_channels.append(w.shape[1] * mult)  # Input channels
                mult = conv_groups[seq] if operator[
                    seq] == opn.CONVTRANSPOSE2D else 1
                output_channels.append(w.shape[0] * mult)  # Output channels

                if len(w.shape) == 2:  # MLP
                    if kernel_size[seq][0] != 1 or kernel_size[seq][1] != 1:
                        eprint(
                            f'The `kernel_size` for the MLP layer {seq} should '
                            f'be set to 1x1 instead of '
                            f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                        error_exit = True
                elif len(w.shape) == 3:  # 1D
                    if kernel_size[seq][0] != w.shape[2] or kernel_size[seq][
                            1] != 1:
                        eprint(
                            f'The `kernel_size` for the 1D layer {seq} should '
                            f'be set to {w.shape[2]}x1 instead of '
                            f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                        error_exit = True
                elif len(w.shape) == 4:  # 2D
                    if kernel_size[seq][0] != w.shape[2] \
                       or kernel_size[seq][1] != w.shape[3]:
                        eprint(
                            f'The `kernel_size` for the 2D layer {seq} should '
                            f'be set to {w.shape[2]}x{w.shape[3]} instead of '
                            f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                        error_exit = True

                w_count = np.prod(w.shape)
                param_count += w_count
                w_size = (w_count * quantization[seq] + 7) // 8
                weight_size.append(w_size)
                param_size += w_size

                if len(w.shape) == 2:  # linear - add dummy 'channel'
                    w = np.expand_dims(w, axis=0)
                else:  # conv1d, conv2d, ... - combine input and output channels
                    w = np.reshape(w, (-1, ) + w.shape[2:])

                weights.append(w)
                weight_keys.append(k)

                # Is there a bias for this layer?
                bias_name = operation + '.bias'

                if bias_name in checkpoint_state and seq not in no_bias:
                    w = checkpoint_state[bias_name].numpy(). \
                        astype(np.int64) // tornadocnn.dev.BIAS_DIV

                    w_min, w_max = w.min(), w.max()
                    assert w_min >= -(2**(bias_quantization[seq] - 1))
                    assert w_max < 2**(bias_quantization[seq] - 1)

                    bias_min.append(w_min)
                    bias_max.append(w_max)

                    bias.append(w)
                    bias_keys.append(bias_name)
                    bias_quant.append(bias_quantization[seq])
                    w_count = np.prod(w.shape)
                    param_count += w_count
                    w_size = (
                        w_count * 8 +
                        (bias_quantization[seq] - 1)) // bias_quantization[seq]
                    bias_size.append(w_size)
                    param_size += w_size
                else:
                    bias.append(None)
                    bias_min.append(0)
                    bias_max.append(0)
                    bias_keys.append('N/A')
                    bias_quant.append(0)
                    bias_size.append(0)

                # Not overriding output_shift?
                if output_shift[seq] is None:
                    output_shift_name = operation.rsplit(
                        sep='.', maxsplit=1)[0] + '.output_shift'
                    # Is there an output_shift for this layer?
                    if output_shift_name in checkpoint_state:
                        w = checkpoint_state[output_shift_name].numpy().astype(
                            np.int64)

                        assert len(w) == 1
                        output_shift[seq] = w[0]
                    else:
                        output_shift[seq] = 0

                # Add implicit shift based on quantization
                output_shift[seq] += 8 - quantization[seq]

                layers += 1
                seq += 1
            elif have_fc_layer:
                eprint(
                    'The network cannot have more than one fully connected software layer, '
                    'and it must be the output layer.')
                sys.exit(1)
            elif fc_layer:
                w = checkpoint_state[k].numpy().astype(np.int64)
                assert w.min() >= -128 and w.max() <= 127
                fc_weights.append(w)
                # Is there a bias for this layer?
                bias_name = operation + '.bias'
                if bias_name in checkpoint_state:
                    # Do not divide bias for FC
                    w = checkpoint_state[bias_name].numpy().astype(np.int64)
                    assert w.min() >= -128 and w.max() <= 127
                    fc_bias.append(w)
                else:
                    fc_bias.append(None)
                have_fc_layer = True

    if verbose:
        print(
            f'Checkpoint for epoch {checkpoint["epoch"]}, model {checkpoint["arch"]} - '
            'weight and bias data:')
        print(
            'Layer  InCh OutCh  Weights         Quant Shift  Min Max   Size '
            'Key                                 Bias       Quant  Min Max Size Key'
        )
        for ll in range(layers):
            if ll < len(weights) and weights[ll] is not None:
                weight_shape = str(weights[ll].shape)
                if bias[ll] is not None:
                    bias_shape = str(bias[ll].shape)
                else:
                    bias_shape = 'N/A'
                if output_shift[ll] is not None:
                    output_shift_shape = output_shift[ll]
                else:
                    output_shift_shape = 'N/A'
                print(
                    f'{ll:4}: '
                    f'{input_channels[ll]:5} {output_channels[ll]:5}  '
                    f'{weight_shape:15} '
                    f'{quant[ll]:5} {output_shift_shape:5} '
                    f'{weight_min[ll]:4} {weight_max[ll]:3} {weight_size[ll]:6} '
                    f'{weight_keys[ll]:35} '
                    f'{bias_shape:10} '
                    f'{bias_quant[ll]:5} {bias_min[ll]:4} {bias_max[ll]:3} {bias_size[ll]:4} '
                    f'{bias_keys[ll]:25}')
        print(
            f'TOTAL: {layers} layers, {param_count:,} parameters, {param_size:,} bytes'
        )

    if error_exit:
        sys.exit(1)

    return layers, weights, bias, output_shift, \
        fc_weights, fc_bias, input_channels, output_channels
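A hedged usage sketch for the checkpoint-based load() above. The checkpoint path and arch name are made up, the per-layer lists would normally come from the YAML parser, and opn is assumed to expose CONV2D like the op module used elsewhere:

layers, weights, bias, output_shift, fc_w, fc_b, in_ch, out_ch = load(
    'trained/example-q.pth.tar',  # hypothetical quantized checkpoint
    arch='ai85net5',              # must match checkpoint['arch'] (case-insensitive)
    fc_layer=False,
    quantization=[None, None],    # None lets load() infer the bit width per layer
    bias_quantization=[8, 8],
    output_shift=[None, None],
    kernel_size=[[3, 3], [3, 3]],
    operator=[opn.CONV2D, opn.CONV2D],
    conv_groups=[1, 1],
)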
Exemple #29
0
def pool2d(
        data,
        input_size,
        output_size,
        pool,
        stride,
        average,
        floor=True,
        debug=False,
):
    """
    Compute 2D Pooling (Average or Max)
    """
    assert data.shape == tuple(input_size)

    if debug:
        # Slow using pure Python
        ref = np.empty(shape=output_size, dtype=np.int64)

        for c in range(input_size[0]):
            for row in range(0, output_size[1]*stride[0], stride[0]):
                for col in range(0, output_size[2]*stride[1], stride[1]):
                    if average:
                        avg = np.average(data[c][row:row+pool[0], col:col+pool[1]])
                        if floor:
                            if avg < 0:
                                val = np.ceil(avg).astype(np.int64).clip(min=-128, max=127)
                            else:
                                val = np.floor(avg).astype(np.int64).clip(min=-128, max=127)
                        else:
                            val = np.floor(avg + 0.5).astype(np.int64).clip(min=-128, max=127)
                    else:
                        val = np.amax(data[c][row:row+pool[0], col:col+pool[1]])
                    ref[c][row//stride[0]][col//stride[1]] = val

    # Fast computation using NumPy
    data_pad = data[:, :(data.shape[1] - pool[0]) // stride[0] * stride[0] + pool[0],
                    :(data.shape[2] - pool[1]) // stride[1] * stride[1] + pool[1], ...]
    h, w = data_pad.strides[1:]
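    # Build a zero-copy sliding-window view: one (pool[0], pool[1]) window per
    # output position, stepped by the pool stride expressed in bytes (h and w are
    # the row/column strides). The reduction over the last two axes below then
    # produces each pooled value.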

    view = as_strided(data_pad,
                      shape=(data_pad.shape[0],
                             1 + (data_pad.shape[1]-pool[0]) // stride[0],
                             1 + (data_pad.shape[2]-pool[1]) // stride[1],
                             pool[0], pool[1]),
                      strides=(data_pad.strides[0], stride[0] * h, stride[1] * w, h, w),
                      writeable=False)

    if average:
        if floor:
            pooled = np.nanmean(view, dtype=np.int64, axis=(3, 4))
        else:
            pooled = np.round(np.nanmean(view, axis=(3, 4))).astype(np.int64)
    else:
        pooled = np.nanmax(view, axis=(3, 4))

    if debug:
        match = (ref == pooled).all()
        if not match:
            eprint('NumPy <-> Python mismatch in compute.pool2d')
            sys.exit(1)

    assert pooled.shape == tuple(output_size)

    return pooled
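
The fast path above avoids explicit Python loops by building a strided window view: `as_strided` exposes every pool window as two extra trailing axes, and the reduction then runs over those axes in a single vectorized call. Below is a minimal, self-contained sketch of the same trick for a 2x2 max pool with stride 2 on a made-up 1x4x4 CHW array (NumPy only; the array contents and sizes are illustrative, not taken from the tool):

import numpy as np
from numpy.lib.stride_tricks import as_strided

# Hypothetical 1-channel 4x4 input in CHW layout
data = np.arange(16, dtype=np.int64).reshape(1, 4, 4)
pool, stride = (2, 2), (2, 2)

h, w = data.strides[1:]
view = as_strided(data,
                  shape=(data.shape[0],
                         1 + (data.shape[1] - pool[0]) // stride[0],
                         1 + (data.shape[2] - pool[1]) // stride[1],
                         pool[0], pool[1]),
                  strides=(data.strides[0], stride[0] * h, stride[1] * w, h, w),
                  writeable=False)

pooled = view.max(axis=(3, 4))  # max over the two window axes
print(pooled)                   # [[[ 5  7]
                                #   [13 15]]]
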
Example #30
0
def print_fptr_site_unmatched_msg():
    eprint(f'{funct.src_filename}:{fptr_sites[j].src_line_num}:'
           f'{funct.asm_filename}:{fptr_sites[j].asm_line_num}: '
           'warning: no type for indirect call site in function '
           f"named '{funct.asm_name}'")  # fix for C++
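
A minimal sketch of how this diagnostic renders, assuming the function above is defined in the same module; `eprint`, `funct`, `fptr_sites`, and `j` below are hypothetical stand-ins for the tool's own globals:

import sys
from types import SimpleNamespace

def eprint(*args, **kwargs):
    # Stand-in for the tool's eprint(): write to stderr
    print(*args, file=sys.stderr, **kwargs)

funct = SimpleNamespace(src_filename='foo.c', asm_filename='foo.s', asm_name='dispatch')
fptr_sites = [SimpleNamespace(src_line_num=42, asm_line_num=117)]
j = 0

# Expected output on stderr:
# foo.c:42:foo.s:117: warning: no type for indirect call site in function named 'dispatch'
print_fptr_site_unmatched_msg()
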
Example #31
0
def verify(
    verify_fn,
    ll,
    in_map,
    out_map,
    out_buf,
    processor_map,
    input_shape,
    out_offset,
    out_expand,
    out_expand_thresh,
    output_width=8,
    pool=None,
    pool_stride=None,
    overwrite_ok=False,
    no_error_stop=False,
    device=84,
    mlator=False,
    apb_base=0,
    stream=None,
    max_count=None,
    write_gap=0,
):
    """
    Verify HWC memory from AI8X, writing C or mem code using the `verify_fn` function.
    The generated code is specific to the network configuration passed in in `processor_map`,
    and `input_shape`. Additionally, the generated addresses are offset by
    `out_offset`. The function takes a pointer to a memory array, and the depth of
    the array does not matter (flattened or not flattened) as long as the size is correct.
    `in_map` and `out_map` are used to optionally prevent overwriting data
    (controlled by `overwrite_ok` and `no_error_stop`).
    When `mlator` is set, use the hardware mechanism to rearrange 4-channel data into single
    channels.
    """
    count = 0

    def check_overwrite(
        p,
        target_offs,
        in_map,
        out_map,
        c,
        row,
        col,
    ):
        # If using single layer, make sure we're not overwriting the input
        if (not overwrite_ok) and in_map[target_offs >> 2] is not None:
            old_ll, old_c, old_row, old_col, _ = in_map[target_offs >> 2]
            eprint(
                f'Processor {p}: '
                f'Layer {ll} output for CHW={c},{row},{col} is overwriting '
                f'input at offset 0x{target_offs:08x} that was created by '
                f'layer {old_ll}, CHW={old_c},{old_row},{old_col}.',
                error=not no_error_stop)
            if not no_error_stop:
                sys.exit(1)
        # Check that we're not overwriting data already written to the output
        if (not overwrite_ok) and out_map is not None \
                and out_map[target_offs >> 2] is not None:
            old_ll, old_c, old_row, old_col, old_val = out_map[target_offs >> 2]
            eprint(
                f'Processor {p}: '
                f'Layer {ll} output for CHW={c},{row},{col} is overwriting '
                f'offset 0x{target_offs:08x}. Previous write by '
                f'layer {old_ll}, CHW={old_c},{old_row},{old_col} with value 0x{old_val:08x}.',
                error=not no_error_stop)
            if not no_error_stop:
                sys.exit(1)

    # Start at the instance of the first active output processor/channel
    coffs_start = ffs(processor_map) & ~(tc.dev.P_SHARED - 1)
    next_layer_map = processor_map >> coffs_start
    # Output expansion for channels and/or wide output
    out_size = output_width // 8
    width = out_expand * out_size

    if not mlator or out_size > 1:
        if mlator:
            eprint('ignoring --mlator for 32-bit output', error=False)

        for doffs in range(input_shape[1] * input_shape[2]):
            row, col = divmod(doffs, input_shape[2])
            this_map = next_layer_map
            coffs = coffs_start
            poffs = coffs_start
            c = 0
            while c < input_shape[0]:
                if c % out_expand_thresh == 0:
                    poffs = coffs_start
                    this_map = next_layer_map  # Wrap around for AI85 channel expansion

                this_c = c
                expand = c // out_expand_thresh  # Channels 64+ handled by processors 0+
                # Physical offset into instance and group
                proc = poffs & ~(tc.dev.P_SHARED - 1)

                # Get four bytes or words either from output or zeros and construct HWC word
                no_data = True
                if out_size == 1:
                    val = 0
                    for _ in range(4):
                        val >>= 8
                        if this_map & 1:
                            no_data = False
                            if c < input_shape[0]:
                                val |= (out_buf[c][row][col] & 0xff) << 24
                            c += 1
                        this_map >>= 1
                else:
                    val = [0] * 4
                    for i in range(4):
                        if this_map & 1:
                            no_data = False
                            if c < input_shape[0]:
                                val[i] = out_buf[c][row][col] & 0xffffffff
                            c += 1
                        this_map >>= 1

                # Get the offset of the first output byte/word of 4
                offs = tc.dev.C_SRAM_BASE + out_offset - (write_gap << 2) + \
                    (((proc % tc.dev.P_NUMPRO) * tc.dev.INSTANCE_SIZE |
                      (proc // tc.dev.P_NUMPRO) * tc.dev.C_GROUP_OFFS // 4) +
                     (doffs * (write_gap + 1)) * width + expand * out_size) * 4

                # Special adjustment for AI84 quirk
                if device == 84 and pool and pool[0] == 4 and pool_stride[0] == 4:
                    offs += (doffs // 4) * 8 + 8

                if not no_data:
                    num_bytes = min(c - this_c, input_shape[0] - this_c)
                    if out_size == 1:
                        check_overwrite(
                            proc,
                            offs,
                            in_map,
                            out_map,
                            this_c,
                            row,
                            col,
                        )
                        if out_map is not None:
                            out_map[offs >> 2] = (ll, this_c, row, col, val)
                        if max_count is None or count < max_count:
                            verify_fn(
                                offs,
                                val,
                                rv=False,
                                comment=f' // {row},{col},{this_c}-{this_c+num_bytes-1}',
                                num_bytes=num_bytes,
                                first_proc=ffs(next_layer_map >> proc) % 4,
                            )
                    else:
                        for i in range(min(num_bytes, out_size)):
                            check_overwrite(
                                proc,
                                offs,
                                in_map,
                                out_map,
                                this_c,
                                row,
                                col,
                            )
                            if out_map is not None:
                                out_map[offs >> 2] = (ll, this_c, row, col,
                                                      val[i])
                            if max_count is None or count < max_count:
                                verify_fn(
                                    offs,
                                    val[i],
                                    rv=False,
                                    comment=f' // {row},{col},{this_c+i}',
                                )
                            offs += out_size
                    count += 1
                    if count == max_count:
                        stream.write('  // Truncated further checks...\n')

                coffs += 4
                poffs += 4
    else:  # mlator == True
        assert out_size == 1
        c = 0
        poffs = coffs_start
        this_map = next_layer_map
        read_addr = None

        while c < input_shape[0]:
            if c % out_expand_thresh == 0:
                poffs = coffs_start  # Wrap around for AI85 channel expansion
                this_map = next_layer_map

            expand = c // out_expand_thresh  # Channels 64+ handled by processors 0+
            # Physical offset into instance and group
            proc = poffs & ~(tc.dev.P_SHARED - 1)

            addr = tc.dev.C_CNN_BASE + (proc // tc.dev.P_NUMPRO) * tc.dev.C_GROUP_OFFS
            mlat = addr + tc.dev.REG_MLAT * 4
            ctrl = addr + tc.dev.REG_CTL * 4

            for shift in range(4):
                if this_map & 1:
                    for doffs in range(0, input_shape[1] * input_shape[2], 4):
                        row, col = divmod(doffs, input_shape[2])

                        # Get four bytes or words either from output or zeros and
                        # construct HWC word
                        val = 0
                        for i in range(4):
                            val >>= 8
                            if col + i < input_shape[2]:
                                val |= (out_buf[c][row][col + i] & 0xff) << 24

                        # Get the offset of the first output byte/word of 4
                        source = out_offset + \
                            (((proc % tc.dev.P_NUMPRO) * tc.dev.INSTANCE_SIZE |
                              (proc // tc.dev.P_NUMPRO) * tc.dev.C_GROUP_OFFS // 4) +
                             (doffs >> 2) * width) * 4

                        if source != read_addr:
                            if doffs != 0:
                                stream.write(
                                    f'  *((volatile uint32_t *) '
                                    f'0x{apb_base + ctrl:08x}) = '
                                    f'0x{tc.dev.READY_SEL << 1 | 1 << 3:08x}; '
                                    '// Disable mlator\n')
                            # Set wptr to start address
                            w = apb_base + addr + tc.dev.C_CNN*4 \
                                + tc.dev.LREG_WPTR_BASE*4 * tc.dev.MAX_LAYERS
                            stream.write(
                                f'  *((volatile uint32_t *) 0x{w:08x}) = '
                                f'0x{source >> 2:08x}; // Set SRAM address\n')
                            # Set wptr_inc to set increment value (default: 1)
                            w = apb_base + addr + tc.dev.C_CNN*4 \
                                + tc.dev.LREG_LCTL2*4 * tc.dev.MAX_LAYERS
                            stream.write(
                                f'  *((volatile uint32_t *) 0x{w:08x}) = '
                                f'0x{expand:08x}; // Set pointer increment\n')
                            # Set mlatorld enable bit to load write ptr; select byte 0..3
                            w = tc.dev.READY_SEL << 1 | 1 << 16 | shift << 17 | 1 << 3
                            stream.write(
                                f'  *((volatile uint32_t *) 0x{apb_base + ctrl:08x}) ='
                                f' 0x{w:08x}; '
                                f'// Enable mlator, byte {shift}\n')
                            stream.write(
                                '  asm volatile ("" : "=m" (*((volatile uint32_t *) '
                                f'0x{apb_base + mlat:08x})) : "r" '
                                f'(*((volatile uint32_t *) 0x{apb_base + mlat:08x})));'
                                ' // Prime\n')

                        num_bytes = min(4, input_shape[2] - col)
                        check_overwrite(
                            proc,
                            tc.dev.C_SRAM_BASE + source,
                            in_map,
                            out_map,
                            c,
                            row,
                            col,
                        )
                        if out_map is not None:
                            out_map[source >> 2] = (ll, c, row, col, val)
                        verify_fn(
                            mlat,
                            val,
                            rv=False,
                            comment=f' // {row},{col}-{col+num_bytes-1},{c}',
                            num_bytes=num_bytes,
                        )

                        read_addr = source + 4
                    # Disable mlator
                    stream.write(f'  *((volatile uint32_t *) '
                                 f'0x{apb_base + ctrl:08x}) = '
                                 f'0x{tc.dev.READY_SEL << 1 | 1 << 3:08x}; '
                                 '// Disable mlator\n')

                this_map >>= 1
                c += 1

            poffs += 4
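
For reference, the non-mlator path above assembles each 32-bit HWC word by shifting the accumulator right one byte and inserting the next active channel's byte at the top, so the first channel of a group of four ends up in the lowest byte. A minimal sketch of that packing with made-up byte values (the values and the name `channel_bytes` are illustrative only):

# Hypothetical int8 outputs for four consecutive channels at one (row, col)
channel_bytes = [0x11, 0x22, 0x83, 0x04]   # 0x83 would be a negative int8 value

val = 0
for b in channel_bytes:
    val >>= 8                   # shift previously inserted bytes down
    val |= (b & 0xff) << 24     # insert the new byte as the most significant byte

print(f'0x{val:08x}')           # 0x04832211 -> channel 0 lands in the low byte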