def add_kernel_data(ll, p, col_target, b):
    """
    Store the weight byte `b` of layer `ll` in the kernel memory of processor `p`.

    The byte goes into column `kern_offs[ll] + col_target`. Bytes fill a column from
    index 8 down to index 0; once nine bytes have been stored, the returned column
    target is advanced by one. Exits the program when the column is outside the
    processor's mask width.

    Returns the (possibly advanced) `col_target`.
    """
    column = kern_offs[ll] + col_target
    if column >= tc.dev.mask_width(p):
        eprint(f'\nKernel memory exceeded in layer {ll}.'
               '\n\nKernel map so far:')
        print_map(layers, kernel_map, print_fn=eprint_noprefix)
        sys.exit(1)

    filled = kernels_used[p][column]
    if filled == 0:
        # First byte for this column: record which layer owns it
        assert kernel_map[p][column] == _INVALID_VALUE
        kernel_map[p][column] = ll
    assert filled <= 8

    kernel_data[p][column][8 - filled] = b & 0xff
    filled += 1
    kernels_used[p][column] = filled

    # A full column (9 bytes) means the next byte goes to the next column
    return col_target + 1 if filled == 9 else col_target
def check_overwrite(
        p,
        target_offs,
        in_map,
        out_map,
        c,
        row,
        col,
):
    """
    Report when writing the output word at `target_offs` would clobber existing data.

    Nothing is checked when `overwrite_ok` is set. Otherwise the word index
    (`target_offs >> 2`) is looked up in `in_map` and, when given, in `out_map`;
    a non-`None` entry is reported through `eprint`, and the program exits
    unless `no_error_stop` is set.
    """
    if overwrite_ok:
        return

    word = target_offs >> 2
    fatal = not no_error_stop

    # The output must not land on top of input data that is still needed
    previous = in_map[word]
    if previous is not None:
        old_ll, old_c, old_row, old_col, _ = previous
        eprint(f'Processor {p}: '
               f'Layer {ll} output for CHW={c},{row},{col} is overwriting '
               f'input at offset 0x{target_offs:08x} that was created by '
               f'layer {old_ll}, CHW={old_c},{old_row},{old_col}.',
               error=fatal)
        if fatal:
            sys.exit(1)

    if out_map is None:
        return

    # The output must not land on a word that was already written as output
    previous = out_map[word]
    if previous is not None:
        old_ll, old_c, old_row, old_col, old_val = previous
        eprint(f'Processor {p}: '
               f'Layer {ll} output for CHW={c},{row},{col} is overwriting '
               f'offset 0x{target_offs:08x}. Previous write by '
               f'layer {old_ll},CHW={old_c},{old_row},{old_col} with value 0x{old_val:08x}.',
               error=fatal)
        if fatal:
            sys.exit(1)
def construct_mapping(self, node, deep=False):
    """
    Build a dict from the YAML mapping `node`, rejecting bad and duplicate keys.

    Raises `yaml.constructor.ConstructorError` when `node` is not a mapping node.
    Unhashable keys and keys that occur more than once are reported through
    `eprint` and terminate the program.
    """
    if not isinstance(node, yaml.MappingNode):
        raise yaml.constructor.ConstructorError(
            None,
            None,
            "Expected a mapping node, but found %s" % node.id,
            node.start_mark,
        )

    result = {}
    for key_node, value_node in node.value:
        key = self.construct_object(key_node, deep=deep)

        # Keys must be hashable to be usable in a dict
        try:
            hash(key)
        except TypeError as exc:
            eprint(f'Found unacceptable key {exc} {key_node.start_mark} '
                   f'while constructing a mapping {node.start_mark}')
            sys.exit(1)

        # A repeated key would silently replace the earlier value
        if key in result:
            eprint(f'Found duplicate key {key} '
                   f'while constructing a mapping{node.start_mark}')
            sys.exit(1)

        result[key] = self.construct_object(value_node, deep=deep)
    return result
def error_exit(message, sequence):
    """
    Report `message`, annotated with the layer sequence `sequence`, and exit with status 1.
    """
    text = f'{message} (found in layer sequence {sequence} in YAML configuration).'
    eprint(text)
    sys.exit(1)
def check_filename(self):
    """
    Exit with an error unless `self.filename` names a plain assembly file.

    Names that do not end in '.s' are rejected, and so are names that end in
    '.cdi.s'.
    """
    name = self.filename
    if not name.endswith('.s'):
        eprint('error: non-assembly file passed:', name)
        sys.exit(1)
    elif name.endswith('.cdi.s'):
        eprint('error: cdi-assembly file passed:', name)
        sys.exit(1)
def makeDate(datestring):
    """
    Convert `datestring` to a timestamp formatted as "%Y-%m-%dT%H:%M:%S.%f".

    The string is parsed with `parser.parse`. When it cannot be parsed, the
    current UTC time is used instead, and a message is printed if the
    module-level `verbose` is greater than 0.
    """
    try:
        d = parser.parse(datestring)
    except (ValueError, OverflowError):
        # dateutil raises OverflowError (not ValueError) for dates beyond the
        # platform's integer range; treat that like any other invalid date.
        if verbose > 0:
            eprint("makeDate: %s is not a valid datetime" % datestring)
        d = datetime.utcnow()
    return d.strftime("%Y-%m-%dT%H:%M:%S.%f")
def check_overwrite(
        self,
        offs,
):
    """
    Report an error when the word at byte offset `offs` already holds a value.

    The word index is `offs >> 2`. Unless `self.no_error_stop` is set, the
    report is flagged as an error and the program exits.
    """
    if not self.mem[offs >> 2]:
        return

    fatal = not self.no_error_stop
    eprint(f'Overwriting location {offs:08x}', error=fatal)
    if fatal:
        sys.exit(1)
def main():
    """
    Run the capture loop: read frames, center on the detected face, and display them.

    Keys: `q` or ESC quits; space, `p` or `s` writes a full-resolution
    screenshot to "passport.jpg". The screenshot needs a detected face and
    eyes; when none were found in the current frame, a message is printed and
    no file is written. The capture device and all windows are released even
    if the loop is interrupted by an exception.
    """
    init()
    try:
        while True:
            (rv, original) = capture.read()
            if not rv:
                break
            fps.incrementFrames()

            # Rotate the image, if the camera is on its side
            if camera_rotation:
                original = np.rot90(original, camera_rotation)

            # Downscale image to make findtheface() faster
            img = cv2.resize(original, frame_downscale)
            imgscale = float(original.shape[0]) / img.shape[0]

            # Find the face and eyes using the Haar cascade
            (face, eyes) = findtheface(img)
            found = face is not None and eyes is not None

            # If both are found, center on the eyes and scale
            if found:
                img = centerandscale(img, face[0], eyes[0])

            # Crop to the proper aspect ratio
            img = crop(img)

            # Show the image (and frames per second)
            cv2.imshow(title, cv2.flip(img, 1))
            if fps.framecount % 10 == 0:
                eprint('%.2f fps' % fps.getFPS(), end='\r')
                fps.reset()

            # Show image and wait for a key
            c = chr(cv2.waitKey(1) & 0xFF)
            if c in ('q', '\x1b'):  # q or ESC to quit
                break
            if c in (' ', 'p', 's'):  # Print a screenshot
                if not found:
                    # Without a detection there is nothing to center on;
                    # indexing face[0] / eyes[0] would raise TypeError.
                    eprint('No face detected; screenshot not written')
                    continue
                # Detection ran on the downscaled image, so scale the
                # coordinates back up to the original frame.
                img = centerandscale(original,
                                     np.dot(face[0], imgscale),
                                     np.dot(eyes[0], imgscale))
                img = crop(img)
                cv2.imwrite("passport.jpg", img)
                print("Wrote image to passport.jpg")
    finally:
        capture.release()
        cv2.destroyAllWindows()
def convert_to_cdi(site, funct, asm_line, asm_dest, cfg, sled_id_faucet,
                   dwarf_loc, options, functs):
    """Converts asm_line to cdi compliant code then writes it to asm_dest

    Dispatches on `site.group`: call sites, return sites and indirect jump
    sites are handed to their converter; PLT sites are only registered via
    `register_file_lines`. Any other group produces a warning and nothing is
    converted.
    """
    if site.group == site.CALL_SITE:
        convert_call_site(site, funct, asm_line, asm_dest,
                          sled_id_faucet, dwarf_loc, options, functs)
    elif site.group == site.RETURN_SITE:
        convert_return_site(site, funct, asm_line, asm_dest, cfg,
                            sled_id_faucet, dwarf_loc, options, functs)
    elif site.group == site.INDIR_JMP_SITE:
        convert_indir_jmp_site(site, funct, asm_line, asm_dest)
    elif site.group == site.PLT_SITE:
        register_file_lines(asm_line, funct.asm_filename)
    else:
        # str() keeps the warning from raising TypeError when the line number
        # is stored as an int; it is a no-op when it is already a string.
        eprint('warning: site has invalid type: line ' + str(site.asm_line_num),
               'in function named \'' + funct.asm_name + '\'')
def get_device(
        device,
):
    """
    Return the implementation configuration object for the integer `device` code.

    The part number is looked up and announced first. Codes 84, 85 and 87 are
    known; any other code is reported and terminates the program.
    """
    part = devices.partnum(device)
    print('Configuring device:', part)

    if device == 84:
        return DevAI84(part)
    if device in (85, 87):
        # For now, 87 has no differences from AI85
        return DevAI85(part)

    eprint(f'Unknown device code `{device}`')
    sys.exit(1)
def relauncher_main(argv=None):
    """
    Re-run this program through `gnucash-env`.

    `argv` defaults to `sys.argv`. Exits with status 1 when `gnucash-env`
    cannot be found.
    """
    argv = sys.argv if argv is None else argv
    if verbose_enabled(argv):
        print("Env.relauncher_main called with {}".format(str(argv)))

    # find gnucash-env, fail if we can't find it
    gnucash_env = find_prog("gnucash-env")
    if gnucash_env is None:
        eprint("Could not find gnucash-env! Is GnuCash correctly installed?")
        sys.exit(1)

    # need to pass the --no-relaunch flag to make sure we don't get stuck in an infinite loop
    # this will make choose_main run accregex_main instead of relauncher_main
    relaunch_prefix = shlex.split("python2 -maccregex --no-relaunch")
    relaunch(gnucash_env, relaunch_prefix + argv)
def relauncher_main(argv=None):
    """
    Relaunch the program under `gnucash-env` with the `--no-relaunch` flag.

    Uses `sys.argv` when `argv` is not given; terminates with status 1 when
    `gnucash-env` is not available.
    """
    if argv is None:
        argv = sys.argv
    if verbose_enabled(argv):
        print("Env.relauncher_main called with {}".format(str(argv)))

    # find gnucash-env, fail if we can't find it
    gnucash_env = find_prog("gnucash-env")
    if gnucash_env is None:
        eprint("Could not find gnucash-env! Is GnuCash correctly installed?")
        sys.exit(1)

    # need to pass the --no-relaunch flag to make sure we don't get stuck in an infinite loop
    # this will make choose_main run accregex_main instead of relauncher_main
    new_argv = shlex.split("python2 -maccregex --no-relaunch") + argv
    relaunch(gnucash_env, new_argv)
def build_ret_dicts(cfg):
    """Builds return dictionaries of all functions in the CFG

    Notice that when a given function is being examined, it is all the other
    functions' return dictionaries that are being built. After all, a function
    foo's return dictionary depends on which functions call foo

    For every function, the targets of its call sites are counted with
    `increment_dict`; each count is then stored in the target's `ret_dict`
    under the calling function's `uniq_label`. Targets that `cfg.funct`
    cannot find (KeyError) only produce a warning.
    """
    beg_multiplicity = 1
    for funct in cfg:
        call_dict = {}
        for site in funct.sites:
            if site.group == site.CALL_SITE:
                for target in site.targets:
                    increment_dict(call_dict, target.uniq_label,
                                   beg_multiplicity)

        # .items() works on both Python 2 and 3; .iteritems() is Python 2 only
        for target_label, multiplicity in call_dict.items():
            try:
                cfg.funct(target_label).ret_dict[funct.uniq_label] = \
                    multiplicity
            except KeyError:
                eprint("warning: function cannot be found: " + target_label)
def play(
    *,
    media,
    verbose: Union[bool, int, float],
    novideo: bool = False,
    noaudio: bool = False,
    subtitles: bool = False,
    loop: bool = False,
    skip_ahead: Optional[float] = None,
    ban_clipboard: bool = False,
    fullscreen: bool = False,
):
    """
    Play `media` in an mpv player with custom key bindings.

    Returns immediately when `check_for_banned_hash` reports the file as
    banned. Otherwise an `mpv.MPV` instance is configured from the keyword
    arguments, key bindings are registered, and playback runs until it ends.

    When the player is shut down by a key binding (`mpv.ShutdownError`):
      * "B" pressed: raises `BanClipboardError` (if `ban_clipboard`) or
        `BanChanError`;
      * "L" or "Meta+L" pressed: raises `PlayChanLaterError`;
      * otherwise: raises `StopPlayingError`.

    The module-level flags `QUIT`, `BAN` and `PLAY_LATER` are reset at the
    start of every call so that a key pressed during an earlier call cannot
    affect this one.
    """
    import shlex  # local import: quoting of paths passed to the shell

    global QUIT
    QUIT = False
    global BAN
    BAN = False
    # Reset like BAN; otherwise a value left over from a previous call would
    # raise PlayChanLaterError on an unrelated shutdown.
    global PLAY_LATER
    PLAY_LATER = False

    media = Path(media).absolute()
    ic(media.as_posix())
    if check_for_banned_hash(
        media=media,
        verbose=verbose,
    ):
        return

    # assert 'sources' in media.parts
    try:
        chan = extract_chan(
            path=media,
            verbose=verbose,
        )
    except ValueError:
        chan = None

    video = not novideo
    audio = not noaudio  # todo
    if video:
        video = "auto"

    player = mpv.MPV(
        log_handler=logger,
        input_default_bindings=True,
        terminal=True,
        input_terminal=True,
        input_vo_keyboard=True,
        # script_opts='osc-layout=bottombar,osc-seekbarstyle=bar,osc-deadzonesize=0,osc-minmousemove=3',
        # script_opts='osc-layout=bottombar',
        osd_bar=False,
        scripts="/home/user/.config/mpv/osc_seek.lua",
        # osc=True,
        video=video,
    )
    # self.m = mpv.MPV(vo='x11')
    # ic(get_current_virtural_terminal())
    if not in_xorg(verbose=verbose):
        player.vo = "drm"
        player.gpu_context = "auto"
    else:
        player.vo = "gpu"
        player.hwdec = "vaapi"

    if fullscreen:
        player.fullscreen = True

    if loop:
        # player.loop_playlist = 'inf'
        player.loop_file = "inf"

    if subtitles:
        player.sub = "yes"
    else:
        player.sub = "no"

    # if skip_ahead:
    #    player.start(skip_ahead)

    # https://github.com/jaseg/python-mpv/issues/122
    # player.on_key_press('ESC')(player.quit)
    # player.on_key_press('ENTER')(lambda: player.playlist_next(mode='force'))

    @player.on_key_press("Alt+i")
    def my_alt_i_binding():
        # ic('Alt+i works')
        media_ext = media.name.split(".")[-1]
        # ic(media_ext)
        # if media_ext:
        try:
            media_json_file = media.as_posix().replace("." + media_ext, ".info.json")
            ic(media_json_file)
            url = jsonparser(path=media_json_file, key="webpage_url")
            ic(url)
        except (UnicodeDecodeError, PermissionError):
            # ic(e)  # nope, will print the binary that was not json
            url = None

        if url:
            put_clipboard(
                url,
                verbose=verbose,
            )
            if os.getuid() == 0:
                os.system('su user -c "/home/user/bin/spider-iri 1" &')
            else:
                os.system("/home/user/bin/spider-iri 1 &")
        else:
            # Quote the path so that spaces, quotes and shell metacharacters
            # in file names cannot break (or inject into) the command.
            import_cmd = "/usr/bin/iridb import " + shlex.quote(media.as_posix())
            if os.getuid() == 0:
                # The command is parsed twice (outer shell, then su's shell),
                # so the whole inner command is quoted once more.
                os.system("su user -c " + shlex.quote(import_cmd))
            else:
                os.system(import_cmd)
        ic("done with Alt+i routine")

    @player.on_key_press("Meta+i")
    def my_meta_i_binding():
        ic("Meta+i works")

    @player.on_key_press("D")
    def my_D_binding():
        ic("D works")
        os.system("mv -vi " + shlex.quote(media.as_posix()) + " /delme/")

    @player.on_key_press("B")
    def my_B_binding():
        global BAN
        BAN = True
        ic("banning:", chan)
        # player.terminate()
        player.quit()

    # pillow_img = player.screenshot_raw()
    # pillow_img.save('screenshot.png')

    @player.on_key_press("L")
    def my_L_binding():
        global PLAY_LATER
        PLAY_LATER = True
        ic("PLAY_LATER:", chan)
        player.quit()

    @player.on_key_press("Meta+L")
    def my_meta_L_binding():
        global PLAY_LATER
        PLAY_LATER = True
        ic("PLAY_LATER:", chan)
        player.quit()

    # player.on_key_press('ENTER')(lambda: player.playlist_next(mode='force'))
    @player.on_key_press("ENTER")
    def my_enter_keybinding():
        ic()
        player.playlist_next(mode="force")

    # ESC must be pressed 2x if the focus is on the terminal due to mpv design:
    # https://github.com/jaseg/python-mpv/issues/122
    player.on_key_press("ESC")(player.quit)
    player.register_key_binding("INS", "seek 5")

    # @player.on_key_press('ESC')
    # def my_esc_binding():
    #    #player.quit()
    #    global QUIT
    #    QUIT = True
    #    player.terminate()

    try:
        player.play(media.as_posix())
        # https://github.com/jaseg/python-mpv/issues/79
        if skip_ahead:
            player.wait_for_property("seekable")
            player.seek(skip_ahead, reference="absolute", precision="exact")
        player.wait_for_playback()
    except mpv.ShutdownError:
        eprint("\nmpv.ShutdownError\n")
        player.terminate()
        if BAN:
            if ban_clipboard:
                clipboard = get_clipboard(
                    one_line=True,
                    verbose=verbose,
                )
                ic("raising BanClipboardError:", clipboard)
                raise BanClipboardError(clipboard)
            else:
                ic("raising BanChanError:", chan)
                raise BanChanError(chan)
        if PLAY_LATER:
            ic("raising PlayChanLaterError:", chan)
            raise PlayChanLaterError(chan)
        raise StopPlayingError
        # pass

    ic("calling player.terminate()")
    player.terminate()
def load(  # pylint: disable=too-many-branches,too-many-statements
        verbose,
        embedded_code,
        device,
        apb,
        start_layer,
        layers,
        operator,
        kernel,
        kernel_size,
        quantization,
        processor_map,
        output_processor_map,
        input_chan,
        output_chan,
        out_expand,
        out_expand_thresh,
        in_expand,
        in_expand_thresh,
        flatten=False,
        mexpress=False,
        verify=False,
        riscv_flash=False,
        quad=False,
        debug=False,
        blocklevel=False,
        legacy_kernels=False,
        calcx4=False,
):
    """
    Stack `kernel` values and write them to C code (for `embedded_code` if `True` or
    RTL simulation). The output is written to the `apb` object.
    Input is configured with `kernel_size`, `quantization`, `layers`, `processor_map`,
    `output_processor_map`, `input_chan`, `output_chan`, `out_expand` and `out_expand_thresh`.
    When `mexpress` is `True`, the function uses the memcpy()-friendly hardware functionality to
    reduce the number of transfers. When `verify` is also true (mexpress mode only), kernels are
    read back and compared.
    This function returns the kernel offsets and the kernel lengths for all layers.

    The function works in two phases:

    1. For every layer from `start_layer` on whose operator is CONV1D, CONV2D or
       CONVTRANSPOSE2D, the kernel offset and length are computed and the weight
       bytes are packed, 9 bytes per column, into `kernel_data`. `kernel_map`
       records which layer owns each (processor, column) pair.
    2. The packed data is emitted through `apb`: in-line writes
       (`apb.write_kern`), C defines plus a `memcpy_96to128()` helper, or the
       express-loader format, depending on `embedded_code`, `mexpress`, `verify`
       and `blocklevel`.

    The program exits (status 1) when `calcx4` is requested on a device without
    support or when a layer does not fit into the kernel memory.

    NOTE(review): `flatten` defaults to `False` but is indexed as `flatten[ll]`,
    so callers presumably always pass a per-layer sequence - confirm.
    """
    # Kernels: Stack kernels; write only the kernels needed
    proc_kern_max = [0] * tc.dev.MAX_PROC  # First free column per processor
    kern_offs = [0] * layers
    kern_len = [0] * layers
    # Layer that owns each (processor, column), or _INVALID_VALUE when unused
    kernel_map = np.full((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE), _INVALID_VALUE,
                         dtype=np.int64)
    # Number of bytes (0..9) already stored in each (processor, column)
    kernels_used = np.zeros((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE), dtype=np.int64)
    kernel_data = np.zeros((tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE, 9), dtype=np.int8)
    # There are four 32-bit words per 9-byte kernel.
    # The value map is initialized with zeros so we can later ignore unused entries and use
    # memcpy() on initialized and uninitialized data.
    kernel_values = np.zeros(
        (tc.dev.MAX_PROC, tc.dev.MASK_WIDTH_LARGE * _WORDS_PER_KERNEL),
        dtype=np.int64)

    if debug:
        print('\nLoading Kernels...')

    if calcx4 and not tc.dev.SUPPORT_CALCX4:
        eprint('--calcx4 is not supported on this device.')
        sys.exit(1)
    assert not (
        (embedded_code or mexpress) and calcx4)  # FIXME Add support later

    for ll in range(start_layer, layers):
        # Only convolution layers have kernels to load
        if operator[ll] not in [op.CONV1D, op.CONV2D, op.CONVTRANSPOSE2D]:
            kern_len[ll] = 0
            kern_offs[ll] = 0
            continue

        if flatten[ll]:
            kernel_reshaped = kernel[ll].reshape(
                output_chan[ll] * input_chan[ll],
                -1,
                kernel_size[ll][0],
                kernel_size[ll][1],
            )
        else:
            kernel_reshaped = kernel[ll]

        first_proc = ffs(processor_map[ll])
        last_proc = fls(processor_map[ll])
        ch = 0
        m = 0
        for p in range(first_proc, last_proc + 1):
            if (processor_map[ll] >> p) & 1 == 0:
                # Unused processor
                continue
            # Get highest offset for all used processors
            kern_offs[ll] = max(proc_kern_max[p], kern_offs[ll])

        ksize = kernel_size[ll][0] * kernel_size[ll][1]
        qfactor = 8 // quantization[ll]  # Number of weights packed into one byte
        # Determine the number of kernels that need to be programmed. Since each instance
        # spans 4 processors, kernels for all instances that have a single processor enabled
        # need to be written, i.e. round down the first. The last does not need to be rounded
        # up because hardware takes care of it.
        next_layer_map = output_processor_map[ll]
        # When using kernels smaller than 8 bit, round up to the next 8-bit boundary
        # Gaps are accounted for like any other kernel.
        kern_len[ll] = 1 + quantization[ll] * \
            (fls(next_layer_map) - ffs(next_layer_map)) // 8
        # This extends the kernels to the right on AI85 for input and output expansion
        if output_chan[ll] > tc.dev.MAX_PROC:
            kern_len[ll] = (kern_len[ll] + tc.dev.P_SHARED - 1) & ~(tc.dev.P_SHARED - 1)
        kern_len[ll] *= out_expand[ll] * in_expand[ll]
        if not legacy_kernels and flatten[ll]:
            kern_len[ll] *= kernel_reshaped.shape[1]
            kern_len[ll] -= (out_expand[ll] * popcount(next_layer_map) - output_chan[ll]) \
                * kernel_reshaped.shape[1] * 8 // (ksize * quantization[ll])
        if device != 84:
            # Pack kernels when using 1D convolutions, or 1x1 kernels
            kern_len[ll] = (kern_len[ll] * ksize + 8) // 9
        if ll == 0 and quad:
            kern_len[0] = (kern_len[0] + 3) // 4

        # We don't have to use dummy columns if there's space available on the left
        kern_offs[ll] = \
            max(0, kern_offs[ll] - (((ffs(next_layer_map) % tc.dev.P_SHARED)
                                     + qfactor - 1) // qfactor))
        # The kernel offset needs to start at a multiple of 4.
        kern_offs[ll] = (kern_offs[ll] + tc.dev.P_SHARED - 1) & ~(tc.dev.P_SHARED - 1)
        # NOTE(review): `p` is the last value left over from the loop above.
        if kern_offs[ll] + kern_len[ll] > tc.dev.mask_width(p):
            eprint(
                f'\nKernel memory exceeded at layer {ll}; offset: {kern_offs[ll]}, '
                f'needed: {kern_len[ll]}.'
                '\n\nKernel map so far:')
            print_map(layers, kernel_map, print_fn=eprint_noprefix)
            sys.exit(1)

        proc_mask = 2**qfactor - 1
        # Start at the first used instance
        this_map_init = next_layer_map >> ffs(next_layer_map)
        start_col = ffs(
            next_layer_map) % tc.dev.P_SHARED  # First target column

        for p in range(first_proc, last_proc + 1):
            if (processor_map[ll] >> p) & 1 == 0:
                # Unused source processor
                continue
            col_target = start_col
            for expand in range(out_expand[ll]):
                this_map = this_map_init
                if ll == 0 and quad:
                    col = expand * (out_expand_thresh[ll] + 3) // 4
                    stop_col = col + (out_expand_thresh[ll] + 3) // 4
                else:
                    col = expand * out_expand_thresh[ll]
                    stop_col = col + out_expand_thresh[ll]
                while col < stop_col:
                    # Skip over unused bits in the target processor map
                    # (unused means 1 bit for 8-bit weights, 2 for 4-bit weights, etc.)
                    if this_map != 0:
                        while this_map & proc_mask == 0:
                            assert this_map != 0
                            col_target += 1  # Completely skip
                            this_map >>= qfactor  # and slide forward
                    this_mask = this_map & proc_mask
                    this_map >>= qfactor

                    if ll == 0 and quad:
                        src_offs = ch + (m - p // 16) * input_chan[ll]
                    else:
                        src_offs = ch + m * input_chan[ll]
                    if ll > 0 or not quad or (m % 4 == p // 16):
                        for ie in range(in_expand[ll]):
                            mask = this_mask

                            # Helper that stores one byte; it closes over the
                            # bookkeeping arrays of the enclosing function and
                            # is re-defined on every iteration of this loop.
                            def add_kernel_data(ll, p, col_target, b):
                                col = kern_offs[ll] + col_target
                                if col >= tc.dev.mask_width(p):
                                    eprint(
                                        f'\nKernel memory exceeded in layer {ll}.'
                                        '\n\nKernel map so far:')
                                    print_map(layers, kernel_map, print_fn=eprint_noprefix)
                                    sys.exit(1)

                                if kernels_used[p][
                                        col] == 0:  # Update kernel map
                                    assert kernel_map[p][col] == _INVALID_VALUE
                                    kernel_map[p][col] = ll

                                assert kernels_used[p][col] <= 8
                                # Bytes fill a column from index 8 down to 0
                                kernel_data[p][col][
                                    8 - kernels_used[p][col]] = b & 0xff
                                kernels_used[p][col] += 1

                                if kernels_used[p][col] == 9:  # Flush
                                    col_target += 1  # Write 1

                                return col_target

                            n = 0
                            if src_offs < len(kernel_reshaped):
                                if not flatten[ll]:
                                    k = np.zeros_like(
                                        kernel_reshaped[src_offs].flatten())
                                    # Merge `qfactor` kernels into one byte per weight
                                    for i in range(qfactor):
                                        if m < output_chan[ll]:
                                            # Cycle through phases
                                            idx = n + ie * qfactor
                                            koffs = src_offs + (idx % in_expand[ll]) \
                                                * in_expand_thresh[ll] \
                                                + (idx // in_expand[ll]) \
                                                * input_chan[ll]
                                            if koffs < len(kernel_reshaped):
                                                this_kern = kernel_reshaped[koffs].flatten() \
                                                    & (2**quantization[ll]-1)
                                                k |= this_kern << (
                                                    i * quantization[ll])
                                            n += 1
                                        mask >>= 1
                                else:
                                    kl = (len(kernel_reshaped[src_offs]) +
                                          qfactor - 1) // qfactor
                                    k = np.zeros(kl, dtype=np.int64)
                                    if m < output_chan[ll]:
                                        # Cycle through phases
                                        idx = n + ie * qfactor
                                        koffs = src_offs + (idx % in_expand[ll]) \
                                            * in_expand_thresh[ll] \
                                            + (idx // in_expand[ll]) \
                                            * input_chan[ll]
                                        if koffs < len(kernel_reshaped):
                                            this_kern = kernel_reshaped[
                                                koffs].flatten()
                                            # Zero-pad to a multiple of `qfactor` entries
                                            if len(this_kern) % qfactor != 0:
                                                this_kern = np.append(
                                                    this_kern,
                                                    np.zeros(qfactor - len(this_kern) % qfactor,
                                                             dtype=np.int64))
                                            for i in range(qfactor):
                                                k |= ((this_kern[i::qfactor]
                                                       & (2**quantization[ll]-1))) \
                                                    << (i * quantization[ll])
                                        n += 1
                                    mask >>= 1
                                if debug:
                                    with np.printoptions(
                                            formatter={
                                                'int': '{0:02x}'.format
                                            }):
                                        print(
                                            f'Layer {ll} processor {p} channel '
                                            f'{ch + ie * in_expand_thresh[ll]} m[{m}..{m+n-1}] '
                                            f'of {output_chan[ll]}: {k}')

                                if flatten[ll]:
                                    for _, e in enumerate(k):
                                        col_target = add_kernel_data(
                                            ll, p, col_target, e)
                                else:
                                    # Weights are stored in reverse order
                                    for i in range(ksize):
                                        col_target = add_kernel_data(
                                            ll, p, col_target,
                                            k[ksize - i - 1])

                            else:  # When expanding, need to pad with zero kernels if needed
                                for _ in range(ksize // qfactor):
                                    col_target = add_kernel_data(
                                        ll, p, col_target, 0)

                        # Consume kernels
                        if not flatten[ll]:
                            col += qfactor
                            m += qfactor
                        else:
                            col += 1
                            m += 1
                    else:
                        m += qfactor

            if kern_offs[ll] + col_target < tc.dev.mask_width(p) \
               and kernels_used[p][kern_offs[ll] + col_target] > 0:  # Partials
                col_target += 1
            # Pad with zero bytes up to the computed kernel length
            while col_target - start_col < kern_len[ll]:
                col_target = add_kernel_data(ll, p, col_target, 0)
            if flatten[ll]:
                kern_len[ll] = col_target
            else:
                assert kern_len[ll] == col_target - start_col
            proc_kern_max[p] = kern_offs[ll] + kern_len[ll]
            ch += 1
            m = 0

    if verbose:
        print('\nKernel map:')
        print_map(layers, kernel_map)

    if verify or not (embedded_code or mexpress):
        if verify:
            apb.output('int verify_kernels(void)\n{\n')
        # Write in-line
        for p in range(tc.dev.MAX_PROC):
            for col in range(0, tc.dev.mask_width(p)):
                ll = kernel_map[p][col]
                if ll != _INVALID_VALUE:
                    k = kernel_data[p][col]
                    apb.write_kern(ll,
                                   p,
                                   col,
                                   k,
                                   verify_only=verify,
                                   calcx4=calcx4)
        if verify:
            apb.output(' return 1;\n}\n\n')
    if embedded_code or mexpress:
        # Write kernels, combining layers and processors where possible to reduce the number
        # of constants and calls to memcpy.
        apb.output('// Kernels:\n')

        if not mexpress:
            # Pack each 9-byte kernel into three 32-bit words (1 + 4 + 4 bytes)
            for p in range(tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        k = kernel_data[p][col]
                        offs = _WORDS_PER_KERNEL * col
                        kernel_values[p][offs] = k[0] & 0xff
                        kernel_values[p][offs + 1] = (k[1] & 0xff) << 24 \
                            | (k[2] & 0xff) << 16 | (k[3] & 0xff) << 8 | k[4] & 0xff
                        kernel_values[p][offs + 2] = (k[5] & 0xff) << 24 \
                            | (k[6] & 0xff) << 16 | (k[7] & 0xff) << 8 | k[8] & 0xff

            # First, define the weights (will move to header file)
            # Combining memcopy() requires stacked memories
            max_col = [-1] * tc.dev.MAX_PROC
            min_col = [tc.dev.MASK_WIDTH_LARGE if not legacy_kernels else 0
                       ] * tc.dev.MAX_PROC
            for p in range(0, tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col[p] = col
                        min_col[p] = min(min_col[p], col)
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    start = p
                    # Extend the run while the next processor continues the data
                    # and shares the group (per the P_NUMPRO mask comparison)
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1)) ==
                           (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                    # Combine multiple channels into one define
                    k = None
                    for i in range(start, p + 1):
                        if k is None:
                            k = kernel_values[i][min_col[i] *
                                                 _WORDS_PER_KERNEL:
                                                 (max_col[i] + 1) *
                                                 _WORDS_PER_KERNEL]
                        else:
                            k = np.concatenate(
                                (k, kernel_values[i]
                                 [min_col[i] *
                                  _WORDS_PER_KERNEL:(max_col[i] + 1) *
                                  _WORDS_PER_KERNEL]))

                    apb.output_define(k, f'KERNELS_{start}', '0x%08x', 8)
                p += 1

            # Second, initialize static const variables as source for memcpy
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    span = max_col[p] + 1 - min_col[p]
                    start = p
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1)) ==
                           (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                        span += max_col[p] + 1 - min_col[p]
                    if riscv_flash:
                        apb.output(rv.RISCV_FLASH)
                    apb.output(
                        f'static const uint32_t kernels_{start}[] = KERNELS_{start};\n'
                    )
                p += 1
            apb.output('\n')

            # Generate code to load the weights using memcpy
            apb.output(
                'void memcpy_96to128(uint32_t *dst, const uint32_t *src, int n)\n{\n'
            )
            apb.output(' while (n-- > 0) {\n'
                       ' *dst++ = *src++;\n'
                       ' *dst++ = *src++;\n'
                       ' *dst++ = *src++;\n'
                       ' *dst++ = 0; // Execute write\n'
                       ' }\n}\n\n')
        else:
            # When using the express loader, gather all consecutive kernels for each processor
            # and pack them.
            zero_kernel = np.array([0] * 9, dtype=np.uint8)
            k = None

            for p in range(tc.dev.MAX_PROC):
                # Find min/max from kernel_map
                max_col = -1
                min_col = tc.dev.mask_width(p) if not legacy_kernels else 0
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col = col
                        min_col = min(min_col, col)
                if max_col >= 0:
                    for col in range(min_col, max_col + 1):
                        ll = kernel_map[p][col]
                        if ll != _INVALID_VALUE:
                            new_k = (kernel_data[p][col] & 0xff).astype(
                                np.uint8)
                        else:
                            # Unused columns inside the range are sent as zeros
                            new_k = zero_kernel
                        if k is None:
                            k = new_k
                        else:
                            k = np.concatenate((k, new_k))

                    # Round up to multiple of 4
                    if len(k) % 4 != 0:
                        k = np.concatenate((k, zero_kernel[:4 - len(k) % 4]))
                    # '>u4' swaps endianness to what the hardware needs, `view` packs into 32-bit
                    if not blocklevel:
                        apb.output_define(k.view(dtype='>u4'), f'KERNELS_{p}',
                                          '0x%08x', 8)
                    else:
                        addr = tc.dev.C_GROUP_OFFS * (p // tc.dev.P_NUMPRO) \
                            + tc.dev.C_MRAM_BASE + (p % tc.dev.P_NUMPRO) * tc.dev.MASK_OFFS * 16
                        apb.write(addr + min_col * 4 | 0x01, 0x01)
                        kb = k.view(dtype=">u4")
                        for _, e in enumerate(kb):
                            apb.write(addr, e)
                            addr += 4

                    if riscv_flash:
                        apb.output(rv.RISCV_FLASH)
                    apb.output(
                        f'static const uint32_t kernels_{p}[] = KERNELS_{p};\n'
                    )
                    k = None
            apb.output('\n')

        if not blocklevel:
            apb.output('void load_kernels(void)\n{\n')
            max_col = [-1] * tc.dev.MAX_PROC
            min_col = [tc.dev.MASK_WIDTH_LARGE if not legacy_kernels else 0
                       ] * tc.dev.MAX_PROC
            for p in range(0, tc.dev.MAX_PROC):
                for col in range(0, tc.dev.mask_width(p)):
                    ll = kernel_map[p][col]
                    if ll != _INVALID_VALUE:
                        max_col[p] = col
                        min_col[p] = min(min_col[p], col)
            p = 0
            while p < tc.dev.MAX_PROC:
                if max_col[p] >= 0:
                    span = max_col[p] + 1 - min_col[p]
                    start = p
                    addr = apb.apb_base + tc.dev.C_GROUP_OFFS * (p // tc.dev.P_NUMPRO) \
                        + tc.dev.C_MRAM_BASE + (p % tc.dev.P_NUMPRO) * tc.dev.MASK_OFFS * 16
                    while (max_col[p] == tc.dev.MASK_OFFS
                           and p + 1 < tc.dev.MAX_PROC and max_col[p + 1] >= 0
                           and min_col[p + 1] == 0
                           and (start & ~(tc.dev.P_NUMPRO - 1)) ==
                           (p + 1 & ~(tc.dev.P_NUMPRO - 1))):
                        p += 1
                        span += max_col[p] + 1 - min_col[p]
                    assert addr % 16 == 0
                    if not mexpress:
                        apb.output(' memcpy_96to128((uint32_t *)'
                                   f' 0x{addr + min_col[start] * 16:08x},'
                                   f' kernels_{start}, {span});\n')
                    else:
                        apb.output(
                            ' *((volatile uint8_t *)'
                            f' 0x{addr + min_col[start] * 4 | 0x01:08x}) = 0x01; '
                            '// Set address\n')
                        apb.output(
                            f' memcpy32((uint32_t *) 0x{addr:08x}, '
                            f'kernels_{start}, {(span * 9 + 3) // 4});\n')
                p += 1

            apb.output('}\n\n')

    return kern_offs, kern_len
def main(
        memfile,
        classification_layer=False,
        unload=False,
        softmax=False,
        embedded_code=False,
        oneshot=0,
        stopstart=False,
        riscv=None,
        riscv_exclusive=False,
        riscv_flash=False,  # pylint: disable=unused-argument
        riscv_cache=False,
        riscv_debug=False,
        riscv_debugwait=True,
        camera=False,
        camera_format=None,
        device=84,
        channels=None,
        sleep=False,
        output_width=8,
        num_classes=None,
        clock_trim=None,
        embedded_arm=False,
        groups=None,
        boost=None,
        forever=False,
        fifo=False,
        mexpress=False,
):
    """
    Write the main function (including an optional call to the fully connected layer if
    `classification_layer` is `True`) to `memfile`.

    The generated C `main()` is written piece by piece with `memfile.write()`, in
    this order: local declarations, clock/cache/power setup, optional RISC-V and
    camera setup, loading and running the CNN, result checks and output, and the
    final `return 0;`.

    `riscv` selects the variant: `None` (no RISC-V), a false value (the code that
    sets up and starts the RISC-V core), or a true value (the code for the RISC-V
    core itself). `groups` is required (asserted) and is used to build the
    power/isolation masks.
    """
    assert groups is not None
    # One bit per CNN group that is in use; `unmask` is the complement within
    # the P_NUMGROUPS available bits
    mask = 0
    for _, group in enumerate(groups):
        mask |= 1 << group
    unmask = ~mask & ((1 << tc.dev.P_NUMGROUPS) - 1)

    if softmax and output_width == 8:
        eprint('--softmax should only be used with `output_width: 32`', error=False)

    if unload:
        memfile.write(f'#define NUM_OUTPUTS {num_classes}\n')
        memfile.write(f'static int{output_width}_t ml_data[NUM_OUTPUTS];\n\n')

    memfile.write('int main(void)\n{\n')
    # Local variable declarations of the generated main()
    if clock_trim is not None and not riscv:
        memfile.write(' uint32_t trim;\n')
    if embedded_code and (classification_layer or softmax) or oneshot > 0:
        memfile.write(' int i;\n')
    # NOTE(review): when this condition and the previous one are both true,
    # `int i;` is written twice - confirm that combination cannot occur.
    if embedded_arm and riscv_debugwait:
        memfile.write(' int i;\n')
    if embedded_code and (classification_layer or softmax):
        memfile.write(' int digs, tens;\n')

    if riscv is None or not riscv:
        if embedded_code or embedded_arm:
            if device == 84:
                memfile.write(' icache_enable();\n\n')
                memfile.write(' SYS_ClockEnable(SYS_PERIPH_CLOCK_AI);\n')
            else:
                memfile.write(
                    '\n MXC_ICC_Enable(MXC_ICC0); // Enable cache\n\n')
                if clock_trim is not None:
                    # clock_trim holds up to three trim values; zero entries are skipped
                    memfile.write(' // Manual clock trim override:\n')
                    memfile.write(' *((volatile uint32_t *) 0x40000c00) = 1; '
                                  '// Set TME\n')
                    if clock_trim[0] or clock_trim[1]:
                        memfile.write(
                            ' trim = *((volatile uint32_t *) 0x40005420);\n')
                        if clock_trim[0]:
                            memfile.write(' trim &= ~0xffff;\n'
                                          f' trim |= 0x{clock_trim[0]:x}; '
                                          '// HIRC8M (7.3728 MHz) trim\n')
                        if clock_trim[1]:
                            memfile.write(
                                ' trim &= ~(0x1ff << 22);\n'
                                f' trim |= 0x{clock_trim[1]:x} << 22; '
                                '// HIRC (60 MHz) trim\n')
                        memfile.write(
                            ' *((volatile uint32_t *) 0x40005420) = trim;\n')
                    if clock_trim[2]:
                        memfile.write(
                            ' trim = *((volatile uint32_t *) 0x40005440) & '
                            '~(0x1ff << 15);\n')
                        memfile.write(
                            ' *((volatile uint32_t *) 0x40005440) = '
                            'trim | (0xff << 15); // HILIM\n')
                        memfile.write(
                            ' *((volatile uint32_t *) 0x40006c04) = '
                            f'0x{clock_trim[2]:x}; // HIRC96M (100 MHz) trim\n'
                        )
                    memfile.write(' *((volatile uint32_t *) 0x40000c00) = 0; '
                                  '// Clear TME\n\n')

                memfile.write(' // Switch to 100 MHz clock\n')
                memfile.write(' MXC_SYS_Clock_Select(MXC_SYS_CLOCK_IPO);\n')
                memfile.write(' SystemCoreClockUpdate();\n')

                memfile.write(
                    '\n // Reset all domains, restore power to CNN\n')
                memfile.write(' MXC_BBFC->reg3 = 0xf; // Reset\n')
                memfile.write(
                    f' MXC_BBFC->reg1 = 0x{mask:01x}; // Mask memory\n')
                memfile.write(f' MXC_BBFC->reg0 = 0x{mask:01x}; // Power\n')
                memfile.write(f' MXC_BBFC->reg2 = 0x{unmask:01x}; // Iso\n')
                memfile.write(' MXC_BBFC->reg3 = 0x0; // Reset\n\n')

                memfile.write(
                    ' MXC_GCR->pclkdiv &= ~(MXC_F_GCR_PCLKDIV_CNNCLKDIV | '
                    'MXC_F_GCR_PCLKDIV_CNNCLKSEL);\n'
                    ' MXC_GCR->pclkdiv |= MXC_S_GCR_PCLKDIV_CNNCLKDIV_DIV1; '
                    '// CNN clock: 100 MHz div 2\n')
                memfile.write(
                    ' MXC_SYS_ClockEnable(MXC_SYS_PERIPH_CLOCK_CNN); '
                    '// Enable CNN clock\n')

                if boost is not None:
                    # boost is indexed as (port, pin)
                    memfile.write(f'\n // Configure P{boost[0]}.{boost[1]}, '
                                  'turn on the CNN Boost\n')
                    memfile.write(' mxc_gpio_cfg_t gpio_out;\n')
                    memfile.write(f' gpio_out.port = MXC_GPIO{boost[0]};\n')
                    memfile.write(
                        f' gpio_out.mask = MXC_GPIO_PIN_{boost[1]};\n')
                    memfile.write(' gpio_out.pad = MXC_GPIO_PAD_NONE;\n')
                    memfile.write(' gpio_out.func = MXC_GPIO_FUNC_OUT;\n')
                    memfile.write(' MXC_GPIO_Config(&gpio_out);\n')
                    memfile.write(
                        ' MXC_GPIO_OutSet(gpio_out.port, gpio_out.mask);\n')
        else:
            memfile.write(' icache_enable();\n\n')
            if device == 84:
                memfile.write(
                    ' MXC_GCR->perckcn1 &= ~0x20; // Enable CNN clock\n')
            else:
                memfile.write(
                    ' *((volatile uint32_t *) 0x40000c00) = 0x00000001; // Set TME\n'
                )
                memfile.write(
                    ' *((volatile uint32_t *) 0x40006c04) = 0x000001a0; // 96M trim\n'
                )
                memfile.write(
                    ' *((volatile uint32_t *) 0x40000c00) = 0x00000000; '
                    '// Clear TME\n\n')
                memfile.write(
                    ' MXC_GCR->clkcn |= MXC_F_GCR_CLKCN_HIRC96M_EN; // Enable 96M\n'
                )
                memfile.write(
                    ' while ((MXC_GCR->clkcn & MXC_F_GCR_CLKCN_HIRC96M_RDY) == 0) ; '
                    '// Wait for 96M\n')
                memfile.write(
                    ' MXC_GCR->clkcn |= MXC_S_GCR_CLKCN_CLKSEL_HIRC96; // Select 96M\n'
                )

                memfile.write(
                    '\n // Reset all domains, restore power to CNN\n')
                memfile.write(' MXC_BBFC->reg3 = 0xf; // Reset\n')
                memfile.write(
                    f' MXC_BBFC->reg1 = 0x{mask:01x}; // Mask memory\n')
                memfile.write(f' MXC_BBFC->reg0 = 0x{mask:01x}; // Power\n')
                memfile.write(f' MXC_BBFC->reg2 = 0x{unmask:01x}; // Iso\n')
                memfile.write(' MXC_BBFC->reg3 = 0x0; // Reset\n\n')

                memfile.write(
                    ' MXC_GCR->pckdiv = 0x00010000; // CNN clock 96M div 2\n')
                memfile.write(
                    ' MXC_GCR->perckcn &= ~0x2000000; // Enable CNN clock\n')

        if riscv is not None:
            # This side starts the RISC-V core: boot address, SRAM access, clock
            if riscv_cache:
                if embedded_code or embedded_arm:
                    memfile.write(
                        '\n MXC_FCR->urvbootaddr = (uint32_t) &__FlashStart_;'
                        '// Set RISC-V boot address\n')
                else:
                    memfile.write(
                        f' MXC_NBBFC->reg4 = 0x{rv.RISCV_CODE_ORIGIN:08x}; '
                        '// Set RISC-V boot address\n')
            if riscv_exclusive:
                if embedded_code or embedded_arm:
                    memfile.write(' MXC_FCR->urvctrl |= 0x00000001; '
                                  '// Exclusive SRAM access for RISC-V\n')
                else:
                    memfile.write(
                        ' *((volatile uint32_t *) 0x40000814) |= 0x00000001; '
                        '// Exclusive SRAM access for RISC-V (MXC_NBBFC->reg5)\n'
                    )
            if embedded_code or embedded_arm:
                memfile.write(
                    ' MXC_GCR->pclkdis1 &= ~MXC_F_GCR_PCLKDIS1_CPU1; '
                    '// Enable RISC-V clock\n')
            else:
                memfile.write(
                    ' MXC_GCR->perckcn1 &= ~MXC_F_GCR_PERCKCN1_CPU1; '
                    '// Enable RISC-V clock\n')
        memfile.write('\n')
    elif riscv:
        if riscv_debug and embedded_code:
            memfile.write(' Debug_Init(); // Set up RISCV JTAG\n')
        if riscv_cache:
            if not embedded_code:
                memfile.write(' icache1_enable();\n')
                memfile.write(' invalidate_icache1();\n\n')
            else:
                memfile.write(
                    ' MXC_ICC_Enable(MXC_ICC1); // Enable cache\n\n')

    if camera:
        memfile.write(' enable_pcif_clock(); // Enable camera clock\n')
        memfile.write(' set_pcif_gpio_altf();\n\n')
        # Map the camera format to the data width used in the register constant
        if camera_format == 555:
            mode = '10'
            comment = '555'
        elif camera_format == 565:
            mode = '12'
            comment = '565'
        else:
            mode = '8'  # Default
            comment = '888'
        memfile.write(
            f' // Enable {comment} format single image in external timing mode\n'
        )
        memfile.write(
            ' MXC_CAMERAIF0->ctrl = MXC_S_CAMERAIF_CTRL_READ_MODE_SINGLE_IMG +\n'
            f' MXC_S_CAMERAIF_CTRL_DATA_WIDTH_{mode}BIT +\n'
            ' MXC_S_CAMERAIF_CTRL_DS_TIMING_EN_DIS +\n'
            ' MXC_S_CAMERAIF_CTRL_PCIF_SYS_EN_EN')
        if channels == 3:
            memfile.write(' +\n (1<<30);\n\n')
        else:
            memfile.write(';\n\n')

    if riscv is None or riscv:
        # This side loads and runs the CNN
        if embedded_code:
            memfile.write(' printf("\\n*** CNN Test ***\\n");\n\n')

        if embedded_code:
            memfile.write(' if (!cnn_load()) fail();\n')
            memfile.write(' MXC_TMR_SW_Start(MXC_TMR0);\n')
        else:
            memfile.write(' if (!cnn_load()) { fail(); pass(); return 0; }\n')

        if stopstart:
            memfile.write('\n cnn_stop();\n')
            memfile.write(' cnn_restart();\n\n')

        memfile.write(' cnn_wait();\n\n')
        if oneshot > 0:
            memfile.write(f' for (i = 0; i < {oneshot}; i++) {{\n')
            memfile.write(' cnn_restart();\n')
            memfile.write(' cnn_wait();\n')
            memfile.write(' }\n\n')

        if not forever and boost is not None:
            memfile.write(' // Turn off the CNN Boost\n')
            memfile.write(
                ' MXC_GPIO_OutClr(gpio_out.port, gpio_out.mask);\n\n')

        memfile.write(' if (!cnn_check()) fail();\n')
        if classification_layer or softmax:
            memfile.write(
                f' if (!{"softmax" if softmax else "fc"}_layer()) fail();\n')
        elif unload:
            memfile.write(f' cnn_unload((uint{output_width}_t *) ml_data);\n')
        if classification_layer:
            memfile.write(' if (!fc_verify()) fail();\n')

        if embedded_code:
            memfile.write('\n printf("\\n*** PASS ***\\n\\n");\n\n')
            memfile.write(
                ' printf("Time for CNN: %d us\\n\\n", cnn_time);\n\n')

            if not forever:
                memfile.write(' // Disable power to CNN\n')
                memfile.write(' MXC_BBFC->reg3 = 0xf; // Reset\n')
                memfile.write(' MXC_BBFC->reg1 = 0x0; // Mask memory\n')
                memfile.write(' MXC_BBFC->reg0 = 0x0; // Power\n')
                memfile.write(' MXC_BBFC->reg2 = 0xf; // Iso\n')
                memfile.write(' MXC_BBFC->reg3 = 0x0; // Reset\n\n')

            if not forever:
                if classification_layer or softmax:
                    memfile.write(
                        ' printf("Classification results:\\n");\n'
                        ' for (i = 0; i < NUM_OUTPUTS; i++) {\n'
                        ' digs = (1000 * ml_softmax[i] + 0x4000) >> 15;\n'
                        ' tens = digs % 10;\n'
                        ' digs = digs / 10;\n'
                        ' printf("[%7d] -> Class %d: %d.%d%%\\n", '
                        f'{"fc_output" if classification_layer else "ml_data"}[i], '
                        'i, digs, tens);\n'
                        ' }\n\n')
            else:
                memfile.write(
                    ' printf("Starting endless loop...\\n");\n\n LED_On(1);\n\n')
                memfile.write(' while(1) {\n')

                # Control register value shared by all groups
                gval = tc.dev.READY_SEL << 1
                if fifo:
                    gval |= 1 << 15
                if device != 84:
                    gval |= 1 << 3  # Enable clocks
                if mexpress:
                    gval |= 1 << 20

                for _, group in enumerate(groups):
                    addr = tc.dev.APB_BASE + tc.dev.C_GROUP_OFFS*group + tc.dev.C_CNN_BASE \
                        + tc.dev.REG_CTL*4
                    memfile.write(
                        f' *((volatile uint32_t *) 0x{addr:08x}) = 0x{gval:08x}; '
                        '// Stop SM\n')
                for _, group in enumerate(groups):
                    val = gval | 0x800
                    # Groups other than 0 are enabled right away; group 0 gets
                    # its enable bit in the separate write below
                    if group > 0:
                        val |= 0x01
                    addr = tc.dev.APB_BASE + tc.dev.C_GROUP_OFFS*group + tc.dev.C_CNN_BASE \
                        + tc.dev.REG_CTL*4
                    memfile.write(
                        f' *((volatile uint32_t *) 0x{addr:08x}) = 0x{val:08x}; '
                        f'// Enable group {group}\n')

                addr = tc.dev.APB_BASE + tc.dev.C_CNN_BASE \
                    + tc.dev.REG_CTL*4
                memfile.write(
                    f' *((volatile uint32_t *) 0x{addr:08x}) = 0x{gval | 0x01:08x}; '
                    '// Master enable group 0\n')

                memfile.write(f' while ((*((volatile uint32_t *) '
                              f'0x{tc.dev.APB_BASE + tc.dev.C_CNN_BASE:08x}) '
                              '& (1<<12)) != 1<<12) ;\n')
                memfile.write(' }\n')

    if riscv is not None and not riscv:
        # This side only starts the RISC-V core and then waits
        if sleep:
            memfile.write(
                ' SCB->SCR |= SCB_SCR_SLEEPDEEP_Msk; // SLEEPDEEP=1\n')
        if embedded_arm:
            if riscv_debugwait:
                memfile.write(' for (i = 0; i < (1 << 27); i++); '
                              '// Let debugger interrupt if needed\n')
            memfile.write(' __WFI(); // Let RISC-V run\n')
        else:
            memfile.write(' asm volatile("wfi"); // Let RISC-V run\n')

    if not embedded_code and not embedded_arm:
        memfile.write(' pass();\n')
    memfile.write(' return 0;\n}\n\n')
def conv2d(
        data,
        weight,
        bias,
        input_size,
        output_size,
        kernel_size,
        stride,
        pad,
        dilation,
        fractional_stride,
        output_pad,
        groups=1,
        debug=False,
):
    """
    Compute a 2D convolution.

    Note that all PyTorch numbers are ordered (C, H, W)

    `data` must have the shape `input_size`; the result is asserted to have the shape
    `output_size`. `weight` is indexed [output channel][input channel][h][w] and `bias`
    holds one value per output channel (or is None).
    `kernel_size`, `stride`, `pad`, `dilation` and `fractional_stride` are indexed
    [0] for the vertical and [1] for the horizontal direction. `output_pad` is only
    consulted to decide whether the zero-padding step runs; it does not change the
    amount of padding applied.

    For `groups` > 1, output channel `i` is connected to the input channels
    `i * (in_channels // groups) + j`.

    With `debug` set, a slow pure-Python reference is computed as well and compared
    against the NumPy result; the program exits with status 1 on a mismatch.
    """
    assert data.shape == tuple(input_size)
    in_channels = input_size[0]
    out_channels = output_size[0]

    if debug:
        # Slow route using pure Python
        # Use an explicit integer sentinel for "not computed"; casting NaN to int64 is
        # undefined and triggers a runtime warning in current NumPy.
        ref = np.full(shape=output_size, fill_value=np.iinfo(np.int64).min, dtype=np.int64)
        debug_print('k,c,x,y,weight,data,prod,cacc,acc')

        for k in range(out_channels):
            for y in range(-pad[0],
                           input_size[1] - dilation[0] * (kernel_size[0] - 1) + pad[0],
                           stride[0]):
                for y_frac in range(fractional_stride[0]):
                    for x in range(-pad[1],
                                   input_size[2] - dilation[1] * (kernel_size[1] - 1) + pad[1],
                                   stride[1]):
                        for x_frac in range(fractional_stride[1]):
                            val = np.int64(0)
                            c = 0
                            while True:
                                dc = c if groups == 1 else c + k * (in_channels // groups)
                                # `np.int` was a plain alias for `int` and has been removed
                                # from NumPy, so use the built-in integer directly.
                                sval = 0
                                for h in range(kernel_size[0]):
                                    for w in range(kernel_size[1]):
                                        ypos = (y + pad[0])*fractional_stride[0] - pad[0] \
                                            + y_frac + h * dilation[0]
                                        yd, yr = divmod(ypos, fractional_stride[0])
                                        xpos = (x + pad[1])*fractional_stride[1] - pad[1] \
                                            + x_frac + w * dilation[1]
                                        xd, xr = divmod(xpos, fractional_stride[1])
                                        if yr == 0 and 0 <= yd < input_size[1] and \
                                           xr == 0 and 0 <= xd < input_size[2]:
                                            prod = weight[k][c][h][w] * data[dc][yd][xd]
                                            sval += prod
                                            val += prod
                                            stats.true_macc += 1
                                            debug_print(
                                                f'{k},{c},{x},{y},{weight[k][c][h][w]},'
                                                f'{data[dc][yd][xd]},{prod},{sval},{val}'
                                            )
                                # Visit the channels in steps of 16: 0, 16, 32, ..., then
                                # 1, 17, 33, ... until every channel has been used.
                                c += 16
                                if c >= in_channels // groups:
                                    c = (c + 1) % 16
                                    if c in (0, in_channels // groups):
                                        break

                            if bias is not None:
                                val += bias[k]
                                debug_print(
                                    f' adding bias: {bias[k]} -> result: {val}'
                                )

                            ref[k][
                                ((y + pad[0])*fractional_stride[0] + y_frac) // stride[0]
                            ][
                                ((x + pad[1])*fractional_stride[1] + x_frac) // stride[1]
                            ] = val

    # Fast computation using NumPy

    # Stretch data for fractionally-strided convolution
    if fractional_stride[0] > 1 or fractional_stride[1] > 1:
        ndata = np.zeros((data.shape[0],
                          data.shape[1] * fractional_stride[0],
                          data.shape[2] * fractional_stride[1]),
                         dtype=data.dtype)
        ndata[:, 0::fractional_stride[0], 0::fractional_stride[1]] = data
        data = ndata

    # Create zero padding around data and stretch weights for dilation.
    if pad[0] or pad[1] or output_pad[0] or output_pad[1]:
        data = np.pad(data, pad_width=((0, 0), (pad[0], pad[0]), (pad[1], pad[1])),
                      mode='constant', constant_values=0)

    if dilation[0] > 1 or dilation[1] > 1:
        nweight = np.zeros((weight.shape[0], weight.shape[1],
                            (kernel_size[0] - 1) * dilation[0] + 1,
                            (kernel_size[1] - 1) * dilation[1] + 1),
                           dtype=weight.dtype)
        nweight[:, :, 0::dilation[0], 0::dilation[1]] = weight
        weight = nweight

    # Number of window positions per direction. The kernel height (weight axis 2) pairs
    # with the data height (axis 1) and the kernel width (axis 3) with the data width
    # (axis 2), matching the window layout of the strided view below. The `// stride + 1`
    # form yields the same count as the reference loop above for every stride.
    h = (data.shape[1] - weight.shape[2]) // stride[0] + 1  # Resulting output height
    w = (data.shape[2] - weight.shape[3]) // stride[1] + 1  # Resulting output width

    # Read-only sliding-window view: (out_y, out_x, channel, kernel_y, kernel_x)
    view = as_strided(data,
                      shape=(h, w, data.shape[0], weight.shape[2], weight.shape[3]),
                      strides=((data.strides[1] * stride[0], data.strides[2] * stride[1],
                                data.strides[0], data.strides[1], data.strides[2])),
                      writeable=False)

    if groups > 1:
        # Expand the grouped weights to a full (mostly zero) weight tensor so that a
        # single tensordot can be used.
        nweight = np.zeros((weight.shape[0], in_channels, weight.shape[2], weight.shape[3]),
                           dtype=weight.dtype)
        for i in range(weight.shape[0]):
            for j in range(in_channels // groups):
                nweight[i, i * (in_channels // groups) + j, :, :] = weight[i, j, :, :]
        weight = nweight

    output = np.tensordot(view, weight, axes=((2, 3, 4), (1, 2, 3))).transpose(2, 0, 1)

    # Apply bias
    if bias is not None:
        for k in range(out_channels):
            output[k] += bias[k]

    if debug:
        if not (ref == output).all():
            eprint('NumPy <-> Python mismatch in compute.conv2d')
            sys.exit(1)

    assert output.shape == tuple(output_size)

    return output
def load(
        verbose,  # pylint: disable=unused-argument
        embedded_code,
        apb,
        layers,
        bias,
        quantization,  # pylint: disable=unused-argument
        group_map,
        output_chan,
        streaming,
        debug,  # pylint: disable=unused-argument
):
    """
    Write `bias` values for the network to C code.

    For every layer `ll` in `range(layers)` that has a bias (`bias[ll]` is not None), the
    values are placed into the bias memory of one of the groups listed in `group_map[ll]`.
    When `embedded_code` is false, the values are written immediately using
    `apb.write_bias()`; otherwise they are collected and emitted at the end as C defines,
    arrays, and a `load_bias()` function using `apb.output()`/`apb.output_define()`.

    Returns the tuple `(bias_offs, bias_group, group_bias_max)`: the per-layer offset into
    the bias memory and the per-layer group (both None for layers without bias), and the
    number of bias memory entries used in each group.

    Exits the program with status 1 when the number of bias values does not match
    `output_chan[ll]` or when the selected group's bias memory would overflow.
    """
    # Bias: Each group has one bias memory (size BIAS_SIZE bytes). Use only the bias memory in
    # one selected group for the layer, and only if the layer uses a bias. Keep track of the
    # offsets so they can be programmed into the mask count register later.

    if embedded_code:
        bias_values = np.zeros((tc.dev.P_NUMGROUPS, tc.dev.BIAS_SIZE), dtype=np.int64)

    group_bias_max = [0] * tc.dev.P_NUMGROUPS
    bias_offs = [None] * layers
    bias_group = [None] * layers
    for ll in range(layers):
        if bias[ll] is None:
            continue
        if len(bias[ll]) != output_chan[ll]:
            eprint(
                f'Layer {ll}: output channel count {output_chan[ll]} does not match the number '
                f'of bias values {len(bias[ll])}.')
            sys.exit(1)
        q = 8  # Fixed to 8 bits instead of quantization[ll]
        qfactor = 8 // q
        # Round up the divided length of bias values
        # FIXME: Is it necessary to handle gaps in the next layer?
        bias_len = (output_chan[ll] + qfactor - 1) // qfactor

        if ll == 0 and streaming[ll] and tc.dev.FIX_STREAM_BIAS:
            # Work around a problem on AI85
            bias_len += 1
        if streaming[ll] and tc.dev.FIX_STREAM_BIAS:
            eprint(
                f'Layer {ll} uses streaming and a bias. '
                'THIS COMBINATION MIGHT NOT BE FUNCTIONING CORRECTLY!!!',
                error=False)

        # Pick the group with the least amount of data in it. Select the group number
        # itself from `group_map[ll]` -- the position of the minimum within that list is
        # not a group number unless the list happens to be [0, 1, ...]. `min()` returns the
        # first of several equally filled groups.
        group = min(group_map[ll], key=lambda t: group_bias_max[t])
        if group_bias_max[group] + bias_len > tc.dev.BIAS_SIZE:
            eprint(
                f'Layer {ll}: bias memory capacity exceeded - available groups: '
                f'{group_map[ll]}, used so far: {group_bias_max}, needed: {bias_len}.'
            )
            sys.exit(1)
        bias_group[ll] = group
        bias_offs[ll] = group_bias_max[group]
        # Each layer has output_channel number of bias values
        i = 0
        target_offs = 0
        if ll == 0 and streaming[ll] and tc.dev.FIX_STREAM_BIAS:
            # Work around a problem on AI85
            if not embedded_code:
                apb.write_bias(group, bias_offs[ll], 0)
            else:
                # Store for later
                bias_values[group][bias_offs[ll]] = 0
            target_offs += 1
        while i < output_chan[ll]:
            b = combine(bias[ll], q, i, output_chan[ll])
            if not embedded_code:
                apb.write_bias(group, bias_offs[ll] + target_offs, b)
            else:
                # Store for later
                bias_values[group][bias_offs[ll] + target_offs] = b & 0xff
            i += qfactor
            target_offs += 1
        group_bias_max[group] += bias_len

    if embedded_code:
        if max(group_bias_max) > 0:
            # At least one bias value exists, output defines
            for group in range(tc.dev.P_NUMGROUPS):
                if group_bias_max[group] == 0:
                    continue  # but not for this group
                apb.output_define(bias_values[group][:group_bias_max[group]],
                                  f'BIAS_{group}', '0x%02x', 16)
            # Output variables
            for group in range(tc.dev.P_NUMGROUPS):
                if group_bias_max[group] == 0:
                    continue
                apb.output(
                    f'static const uint8_t bias_{group}[] = BIAS_{group};\n')
            apb.output('\n')

            # Finally, create function and do memcpy()
            apb.output(
                'void memcpy_8to32(uint32_t *dst, const uint8_t *src, size_t n)\n{\n'
            )
            apb.output(' while (n-- > 0) {\n *dst++ = *src++;\n }\n}\n\n')

        apb.output('void load_bias(void)\n{\n')
        for group in range(tc.dev.P_NUMGROUPS):
            if group_bias_max[group] == 0:
                continue
            addr = apb.apb_base + tc.dev.C_GROUP_OFFS * group + tc.dev.C_BRAM_BASE
            apb.output(
                f' memcpy_8to32((uint32_t *) 0x{addr:08x}, bias_{group}, '
                f'sizeof(uint8_t) * {group_bias_max[group]});\n')
        apb.output('}\n\n')

    return bias_offs, bias_group, group_bias_max
def extract_funct(asm_file, funct_name, line_num, dwarf_loc):
    """Constructs a function from the assembly file.

    File pointer must point at first instruction of the function. The return
    dictionary and target list of site are not built here. Only fields
    initialized in a function's constructor are initialized. However,
    each site of a function has its return dictionary linked to the
    function's return dictionary.

    Lines are read from `asm_file` until a line whose first word starts with
    '.LFE' is found. Call instructions, return instructions, and indirect
    jumps (jump instructions with a '%' in their argument) are recorded as
    sites. `line_num` is incremented for every line read.

    Returns the tuple (new function object, updated line number). If the end
    of the file is reached before '.LFE', an error is printed and the program
    exits with status 1.
    """
    start_line_num = line_num
    call_list = ["call", "callf", "callq"]
    returns = ["ret", "retf", "iret", "retq", "iretq"]
    jmp_list = [
        "jo", "jno", "jb", "jnae", "jc", "jnb", "jae", "jnc", "jz", "je",
        "jnz", "jne", "jbe", "jna", "jnbe", "ja", "js", "jns", "jp", "jpe",
        "jnp", "jpo", "jl", "jnge", "jnl", "jge", "jle", "jng", "jnle", "jg",
        "jecxz", "jrcxz", "jmp", "jmpe"
    ]
    # Site type codes (PLT_SITE is not used in this function)
    CALL_SITE, RETURN_SITE, INDIR_JMP_SITE, PLT_SITE, = 0, 1, 2, 3

    asm_line = asm_file.readline()
    line_num += 1

    comment_continues = False
    sites = []
    direct_call_sites = []
    empty_ret_dict = dict()

    while asm_line:
        asm_parsing.update_dwarf_loc(asm_line, dwarf_loc)
        try:
            first_word = asm_line.split()[0]
        except IndexError:
            # ignore empty line
            asm_line = asm_file.readline()
            line_num += 1
            continue

        if first_word[:len('.LFE')] == '.LFE':
            break

        targets = []
        _labels, key_symbol, arg_str, comment_continues = (
            asm_parsing.decode_line(asm_line, comment_continues))

        if key_symbol in call_list:
            new_site = funct_cfg.Site(line_num, targets, CALL_SITE, dwarf_loc)
            # A call without a register operand names its target directly
            if '%' not in arg_str:
                new_site.targets.append(arg_str)
                direct_call_sites.append(new_site)
            sites.append(new_site)
        elif key_symbol in returns:
            # empty return dict passed so that every site's return dict is
            # a reference to the function's return dict
            new_ret_site = funct_cfg.Site(line_num, empty_ret_dict,
                                          RETURN_SITE, dwarf_loc)
            sites.append(new_ret_site)
        elif key_symbol in jmp_list:
            if '%' in arg_str:
                sites.append(
                    funct_cfg.Site(line_num, targets, INDIR_JMP_SITE,
                                   dwarf_loc))

        asm_line = asm_file.readline()
        line_num += 1
    else:
        # End of file reached without seeing the '.LFE' end marker.
        # `start_line_num` is an integer and must be converted before it can
        # be concatenated into the message.
        eprint(
            dwarf_loc.filename() + ':' + asm_file.name + ':'
            + str(start_line_num) + ' error: unterminated function: ',
            funct_name)
        sys.exit(1)

    fn = dwarf_loc.filename()
    new_funct = funct_cfg.Function(funct_name, asm_file.name, fn, sites,
                                   start_line_num)
    new_funct.direct_call_sites = direct_call_sites
    new_funct.ret_dict = empty_ret_dict

    return new_funct, line_num
def create_net(  # pylint: disable=too-many-arguments,too-many-locals,too-many-branches
        prefix,
        verbose,
        verbose_all,
        debug,
        log,
        layers,
        operator,
        auto_input_dim,
        input_dim,
        pooled_dim,
        output_dim,
        kernel_size,
        quantization,  # pylint: disable=unused-argument
        output_shift,
        input_chan,
        output_chan,
        conv_groups,
        output_width,
        padding,
        dilation,
        stride,
        pool,
        pool_stride,
        pool_average,
        activation,
        data,
        kernel,
        bias,
        fc_weights,
        fc_bias,
        flatten,
        operands,
        eltwise,
        pool_first,
        in_sequences,
        c_filename,
        base_directory,
        log_filename,
        weight_filename,
        sample_filename,
        avg_pool_rounding,
        device=84,
        legacy_test=False,
):
    """
    Create the CMSIS NN network.

    Files are created in the directory `base_directory`/`prefix`: the C source
    `c_filename` + '.c', the sample data header `sample_filename`, the weights header
    `weight_filename` and, when `log` is set, the log file `log_filename` (standard
    output is redirected to it and not restored by this function).

    For each of the `layers` layers, the layer is computed using the `*_layer()` helper
    functions, the resulting dimensions are checked against `output_dim`/`output_chan`,
    and the matching CMSIS-NN function calls are written to the C file. The output of the
    last layer is written as `OUTPUT_DATA` so the generated code can compare against it.

    Note that `output_width[-1]` is forced to 8 and that None entries of `quantization`
    and `output_shift` are replaced by their defaults in place.

    The program exits with status 1 for unsupported configurations: quantization other
    than 8, convolution groups that do not divide the channel counts, element-wise
    operations that are not `pool_first`, ConvTranspose2d, activations other than ReLU,
    unknown operators, and input concatenation errors.
    """
    if output_width[-1] != 8:
        eprint(
            'CMSIS network generator does not currently support `output_width` that is not 8. '
            'Forcing to 8 bit.',
            error=False)  # FIXME: Support 32-bit output
        output_width[-1] = 8

    # Human readable per-layer strings used in the comments of the generated C code
    input_dim_str = [None] * layers
    output_dim_str = [None] * layers
    kernel_size_str = [None] * layers
    pool_str = [None] * layers
    padding_str = [None] * layers
    pool_stride_str = [None] * layers
    stride_str = [None] * layers
    for ll in range(layers):
        if quantization[ll] is None:
            quantization[ll] = 8  # Set default
        elif quantization[ll] != 8:  # FIXME: Support quantization
            eprint(
                'CMSIS network generator does not currently support `quantization` != 8.'
            )
            sys.exit(1)
        if output_shift[ll] is None:
            output_shift[ll] = 0  # Set default
        if operator[ll] != op.CONV1D:
            # Two-dimensional description
            input_dim_str[ll] = f'{input_dim[ll][0]}x{input_dim[ll][1]}'
            output_dim_str[ll] = f'{output_dim[ll][0]}x{output_dim[ll][1]}'
            kernel_size_str[ll] = f'{kernel_size[ll][0]}x{kernel_size[ll][1]}'
            pool_str[ll] = f'{pool[ll][0]}x{pool[ll][1]}' \
                if pool[ll][0] > 1 or pool[ll][1] > 1 else '0x0'
            padding_str[ll] = f'{padding[ll][0]}/{padding[ll][1]}'
            pool_stride_str[ll] = f'{pool_stride[ll][0]}/{pool_stride[ll][1]}'
            stride_str[ll] = f'{stride[ll][0]}/{stride[ll][1]}'
        else:
            # One-dimensional description, only the first element of each pair is shown
            input_dim_str[ll] = f'{input_dim[ll][0]}'
            output_dim_str[ll] = f'{output_dim[ll][0]}'
            kernel_size_str[ll] = f'{kernel_size[ll][0]}'
            pool_str[ll] = f'{pool[ll][0]}' \
                if pool[ll][0] > 1 or pool[ll][1] > 1 else '0'
            padding_str[ll] = f'{padding[ll][0]}'
            pool_stride_str[ll] = f'{pool_stride[ll][0]}'
            stride_str[ll] = f'{stride[ll][0]}'
        if input_chan[ll] % conv_groups[ll] != 0 or output_chan[
                ll] % conv_groups[ll] != 0:
            eprint(
                f'Layer {ll}: convolution groups {conv_groups[ll]} does not divide'
                f' the input channels {input_chan[ll]} or output channels {output_chan[ll]}.'
            )
            sys.exit(1)

    test_name = prefix
    print(f'{test_name}...')

    os.makedirs(os.path.join(base_directory, test_name), exist_ok=True)

    # Redirect stdout?
    # NOTE(review): the log file is never closed and sys.stdout is not restored here.
    if log:
        sys.stdout = open(
            os.path.join(base_directory, test_name, log_filename), 'w')
        print(f'{" ".join(str(x) for x in sys.argv)}')
        print(f'{devices.partnum(device)}\n')
        print(f'{test_name}')

    filename = c_filename + '.c'
    # NOTE(review): both header files are closed at the end of the function only, i.e.,
    # not on any of the sys.exit() paths below.
    sampledata_header = \
        open(os.path.join(base_directory, test_name, sample_filename), mode='w')
    weight_header = \
        open(os.path.join(base_directory, test_name, weight_filename), mode='w')

    with open(os.path.join(base_directory, test_name, filename), mode='w') as c_file:
        toplevel.copyright_header(c_file)

        c_file.write(f'// {test_name}\n')
        c_file.write(
            f'// Created using {" ".join(str(x) for x in sys.argv)}\n')
        c_file.write('\n')

        toplevel.header(c_file, 0, embedded_code=True, cmsis_nn=True)

        # Pre-define data memory loader.
        d = data.transpose((1, 2, 0)).flatten()  # CHW -> HWC
        toplevel.c_define(sampledata_header, d, 'INPUT_DATA', '%d', 16)
        input_size = d.size
        c_file.write('static const q7_t input_data[] = INPUT_DATA;\n')
        c_file.write(
            f'static const q{output_width[-1]-1}_t output_data[] = OUTPUT_DATA; '
            '// Last conv layer output\n')

        # Pre-define the kernels and bias values
        for ll in range(layers):
            # Rearrange kernels when emulating a fully connected network using 1x1 Conv2D
            # CMSIS data uses HWC, PyTorch uses CHW
            if operator[ll] != op.NONE:
                if kernel_size[ll] == [1, 1] and input_dim[ll] == [1, 1]:
                    w = kernel[ll]. \
                        reshape((output_chan[ll],
                                 input_chan[ll] // (auto_input_dim[ll][0] * auto_input_dim[ll][1]),
                                 auto_input_dim[ll][0], auto_input_dim[ll][1],
                                 kernel_size[ll][0], kernel_size[ll][1])). \
                        transpose((0, 4, 5, 2, 3, 1)). \
                        flatten()
                elif flatten[ll]:
                    w = kernel[ll]. \
                        reshape((output_chan[ll],
                                 input_chan[ll],
                                 auto_input_dim[ll][0], auto_input_dim[ll][1],
                                 kernel_size[ll][0], kernel_size[ll][1])). \
                        transpose((0, 4, 5, 2, 3, 1)). \
                        flatten()
                else:
                    w = kernel[ll]. \
                        reshape((output_chan[ll], input_chan[ll],
                                 kernel_size[ll][0], kernel_size[ll][1])). \
                        transpose((0, 2, 3, 1)). \
                        flatten()
                toplevel.c_define(weight_header, w, f'WEIGHTS_{ll}', '%d', 16)
                # NOTE(review): the bias definition is assumed to belong to the
                # `operator != NONE` branch, matching the declarations below -- confirm.
                if bias[ll] is not None:
                    b = bias[ll].flatten()
                else:
                    # We need empty bias values (the Arm code needs them both for rounding of
                    # the shifted output, and it does not like NULL bias pointers)
                    b = np.zeros(output_chan[ll], dtype=np.int64)
                toplevel.c_define(weight_header, b, f'BIAS_{ll}', '%d', 16)
        c_file.write('\n')

        # Declare the weight and bias arrays that use the defines from the weights header
        for ll in range(layers):
            if operator[ll] != op.NONE:
                c_file.write(
                    f'static const q7_t weights_{ll}[] = WEIGHTS_{ll};\n')
                c_file.write(f'static const q7_t bias_{ll}[] = BIAS_{ll};\n')
        c_file.write('\n')

        # Compute buffer sizes
        col_buffer_size = 0
        img_buffer_size = 0
        for ll in range(layers):
            col_buffer_size = max(
                col_buffer_size,
                2 * input_chan[ll] * kernel_size[ll][0] * kernel_size[ll][1])
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                col_buffer_size = max(col_buffer_size,
                                      pooled_dim[ll][0] * input_chan[ll])  # q15_t doesn't need 2*
            img_buffer_size = max(
                img_buffer_size,
                input_chan[ll] * input_dim[ll][0] * input_dim[ll][1],
                output_chan[ll] * output_dim[ll][0] * output_dim[ll][1])

        # buffer0 must also be able to hold a copy of the input (see the pooling memcpy)
        c_file.write(
            f'static q7_t buffer0[{max(img_buffer_size, input_size)}];\n')
        c_file.write(f'static q7_t buffer1[{img_buffer_size}];\n')
        c_file.write(f'static q15_t col_buffer[{col_buffer_size}];\n\n')

        c_file.write('int cnn_run(const q7_t *input, int input_size, '
                     'q7_t **output, int *output_size)\n{\n')

        # Compute layer-by-layer output and chain results into input
        # `buffer0`/`buffer1` hold the names of the C buffers; they are swapped every time
        # an operation has written its result into `buffer1`.
        buffer0, buffer1 = 'buffer0', 'buffer1'

        def run_eltwise(
                data,
                ll,
        ):
            """
            In-flight element-wise operations

            Runs `eltwise_layer()` for layer `ll` on `data`, checks that the returned
            size matches the dimensions of one operand, and returns the resulting data.
            """
            if operator[ll] == op.NONE:
                # Let element-wise do 32-bit, else 8-bit only
                o_width = output_width[ll]
            else:
                o_width = 8
            d_shape = data.shape

            data, out_size = eltwise_layer(
                eltwise[ll],
                ll,
                verbose,
                verbose_all or ll == layers - 1,
                data[0].shape,
                output_shift[ll],
                data,
                output_width=o_width,
                device=device,
                debug=False,
                operands=operands[ll],
            )
            assert out_size[0] == d_shape[1] \
                and out_size[1] == d_shape[2] and out_size[2] == d_shape[3]

            return data

        # data_buf[0] is the network input, data_buf[n + 1] is the output of layer n
        data_buf = [data]
        # Compute layer-by-layer output and chain results into input
        for ll in range(layers):
            # Concatenate input data if needed
            if in_sequences[ll] is not None:
                if isinstance(in_sequences[ll], list):
                    try:
                        data = np.concatenate(
                            [data_buf[i + 1] for i in in_sequences[ll]], axis=0)
                    except ValueError as err:
                        eprint('Error in input data concatenation layer:', err)
                        sys.exit(1)
                else:
                    data = data_buf[in_sequences[ll] + 1]
            else:
                data = data_buf[-1]

            # Split data into multiple inputs if needed
            if operands[ll] > 1:
                if ll == 0 and legacy_test:
                    data = np.array(np.split(data, operands[ll], axis=0))
                elif legacy_test:
                    # Legacy mode takes every `operands`-th element of the last axis
                    d = np.empty((operands[ll],
                                  data.shape[0],
                                  data.shape[1],
                                  data.shape[2] // operands[ll]),
                                 dtype=np.int64)
                    for i in range(operands[ll]):
                        d[i, :, :, :] = data[:, :, i::operands[ll]]
                    data = d
                else:
                    data = np.array(np.split(data, operands[ll], axis=0))
            else:
                # Single operand, add a leading axis of length 1
                data = np.expand_dims(data, 0)

            show_data(
                ll,
                verbose,
                verbose_all or ll == layers - 1,
                data.shape,
                data,
                debug=False,
                expand=1,
                expand_thresh=1,
                operation=operator[ll],
                operands=operands[ll],
            )

            in_chan = input_chan[ll]

            # Run in-flight element-wise operations first?
            if operands[ll] > 1 and not pool_first[ll]:
                eprint(
                    "Element-wise operations are currently not implemented for CMSIS-NN"
                )
                sys.exit(1)  # FIXME: Support element-wise operations
                # Not reached because of the sys.exit() above
                data = np.expand_dims(run_eltwise(data, ll), 0)

            # Allow 1D <-> 2D and 2D W/L conversions
            if operator[ll] == op.CONV1D:
                assert input_dim[ll][1] == 1
                data = data.reshape(data.shape[0], data.shape[1], input_dim[ll][0])
            else:
                data = data.reshape(data.shape[0], data.shape[1],
                                    input_dim[ll][0], input_dim[ll][1])

            # In-flight pooling
            data, out_size = pooling_layer(
                ll,
                verbose,
                verbose_all or ll == layers - 1,
                data[0].shape,
                pool[ll],
                pool_stride[ll],
                pool_average[ll],
                data,
                debug=False,
                expand=1,
                expand_thresh=1,
                operation=operator[ll],
                operands=data.shape[0],
                rounding=avg_pool_rounding,
                debug_data=None,
            )

            if operator[ll] == op.CONV1D:
                assert out_size[0] == in_chan \
                    and out_size[1] == pooled_dim[ll][0] \
                    and pooled_dim[ll][1] == 1
            else:
                assert out_size[0] == in_chan \
                    and out_size[1] == pooled_dim[ll][0] \
                    and out_size[2] == pooled_dim[ll][1]

            if operands[ll] > 1 and pool_first[ll]:
                data = run_eltwise(data, ll)
            else:
                # Remove the operand axis again
                data = np.squeeze(data, axis=0)

            # Convolution or passthrough
            if operator[ll] == op.CONV2D:
                if flatten[ll]:
                    in_chan *= input_dim[ll][0] * input_dim[ll][1]
                    data = data.reshape(in_chan, 1, 1)
                    if verbose:
                        print(f"FLATTEN TO {in_chan}x1x1...\n")

                out_buf, out_size = conv2d_layer(
                    ll,
                    verbose,
                    verbose_all or ll == layers - 1,
                    data.shape,
                    kernel_size[ll],
                    output_shift[ll],
                    output_chan[ll],
                    padding[ll],
                    dilation[ll],
                    stride[ll],
                    activation[ll],
                    kernel[ll].reshape(output_chan[ll], in_chan,
                                       kernel_size[ll][0], kernel_size[ll][1]),
                    bias[ll],
                    data,
                    output_width=output_width[ll],
                    groups=conv_groups[ll],
                    device=device,
                    debug=False,
                )
            elif operator[ll] == op.CONVTRANSPOSE2D:
                out_buf, out_size = convtranspose2d_layer(
                    ll,
                    verbose,
                    verbose_all or ll == layers - 1,
                    data.shape,
                    kernel_size[ll],
                    output_shift[ll],
                    output_chan[ll],
                    padding[ll],
                    dilation[ll],
                    stride[ll],
                    [1, 1],  # output_padding
                    activation[ll],
                    kernel[ll].reshape(
                        output_chan[ll],
                        in_chan,
                        kernel_size[ll][0],
                        kernel_size[ll][1],
                    ),
                    bias[ll],
                    data,
                    output_width=output_width[ll],
                    groups=conv_groups[ll],
                    device=device,
                    debug=False,
                )
            elif operator[ll] == op.CONV1D:
                out_buf, out_size = conv1d_layer(
                    ll,
                    verbose,
                    verbose_all or ll == layers - 1,
                    data.shape,
                    kernel_size[ll][0],
                    output_shift[ll],
                    output_chan[ll],
                    padding[ll][0],
                    dilation[ll][0],
                    stride[ll][0],
                    activation[ll],
                    kernel[ll].reshape(
                        output_chan[ll],
                        input_chan[ll],
                        kernel_size[ll][0],
                    ),
                    bias[ll],
                    data,
                    output_width=output_width[ll],
                    groups=conv_groups[ll],
                    device=device,
                    debug=False,
                )
            elif operator[ll] == op.NONE:  # '0'D (pooling only or passthrough)
                out_buf, out_size = passthrough_layer(
                    ll,
                    verbose,
                    verbose_all or ll == layers - 1,
                    data.shape,
                    data,
                    device=device,
                    debug=False,
                )
            else:
                eprint(f'Unknown operator `{op.string(operator[ll])}`.')
                sys.exit(1)

            assert out_size[0] == output_chan[ll] \
                and out_size[1] == output_dim[ll][0] and out_size[2] == output_dim[ll][1]

            # Describe the layer in a comment of the generated C code
            c_file.write(f' // Layer {ll}: '
                         f'{str(operands[ll])+"x" if operands[ll] > 1 else ""}'
                         f'{input_chan[ll]}x{input_dim_str[ll]}'
                         f'{" flattened, " if flatten[ll] else ", "}')
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                c_file.write(
                    f'{pool_str[ll]} {"avg" if pool_average[ll] else "max"} '
                    f'pool with stride {pool_stride_str[ll]}')
            else:
                c_file.write('no pooling')
            if operator[ll] in [op.CONV1D, op.CONV2D, op.CONVTRANSPOSE2D]:
                conv_str = f', {op.string(operator[ll])} with kernel size ' \
                           f'{kernel_size_str[ll]}, ' \
                           f'stride {stride_str[ll]}, ' \
                           f'pad {padding_str[ll]}, '
            else:
                conv_str = ', no convolution, '
            c_file.write(conv_str +
                         f'{output_chan[ll]}x{output_dim_str[ll]} output\n')

            c_file.write(
                f' // Dimensions: [{input_chan[ll]}, {input_dim[ll][0]}, '
                f'{input_dim[ll][1]}]')
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                c_file.write(
                    f' -> [{input_chan[ll]}, {pooled_dim[ll][0]}, {pooled_dim[ll][1]}]'
                )
            if flatten[ll]:
                c_file.write(
                    f' -> [{input_chan[ll]*pooled_dim[ll][0]*pooled_dim[ll][1]}, 1, 1]'
                )
            if operator[ll] != op.NONE:
                c_file.write(f' -> {out_size}\n')
            else:
                c_file.write('\n')

            # The first layer reads the input array directly unless pooling is used
            source = 'input_data' if ll == 0 else buffer0

            if pool[ll][0] > 1 or pool[ll][1] > 1:
                if ll == 0:
                    c_file.write(' memcpy(buffer0, input, input_size);'
                                 ' // Pooling may destroy input\n')
                pool_type = 'ave' if pool_average[ll] else 'max'
                if pool[ll][0] != pool[ll][1]:
                    c_file.write(
                        f' arm_{pool_type}pool_nonsquare_q7_HWC_nonsquare({buffer0}, '
                        f'{input_dim[ll][1]}, {input_dim[ll][0]}, '
                        f'{input_chan[ll]}, {pool[ll][1]}, {pool[ll][0]}, 0, 0, '
                        f'{pool_stride[ll][1]}, {pool_stride[ll][0]}, '
                        f'{pooled_dim[ll][1]}, {pooled_dim[ll][0]}, '
                        f'(q7_t *) col_buffer, {buffer1});\n')
                else:
                    if input_dim[ll][0] == input_dim[ll][1]:
                        c_file.write(
                            f' arm_{pool_type}pool_q7_HWC({buffer0}, '
                            f'{input_dim[ll][0]}, {input_chan[ll]}, '
                            f'{pool[ll][0]}, 0, {pool_stride[ll][0]}, '
                            f'{pooled_dim[ll][0]}, (q7_t *) col_buffer, {buffer1});\n'
                        )
                    else:
                        c_file.write(
                            f' arm_{pool_type}pool_q7_HWC_nonsquare({buffer0}, '
                            f'{input_dim[ll][1]}, {input_dim[ll][0]}, '
                            f'{input_chan[ll]}, {pool[ll][0]}, 0, {pool_stride[ll][0]}, '
                            f'{pooled_dim[ll][1]}, {pooled_dim[ll][0]}, '
                            f'(q7_t *) col_buffer, {buffer1});\n')
                # The pooled data is the input of the convolution that follows
                source = buffer1
                buffer0, buffer1 = buffer1, buffer0

            if operator[ll] != op.NONE:
                in_chan = input_chan[ll]
                in_dim = pooled_dim[ll]
                if flatten[ll]:
                    in_chan *= pooled_dim[ll][0] * pooled_dim[ll][1]
                    in_dim = [1, 1]

                if operator[ll] in [op.CONVTRANSPOSE2D]:  # FIXME: Support ConvTranspose2d
                    eprint(
                        "CMSIS-NN generator does not currently support the operator "
                        f"`{op.string(operator[ll])}` in layer {ll}")
                    sys.exit(1)

                # FIXME: First check that everything is [-128, +127] and use s8 function otherwise

                # Check for squareness
                if kernel_size[ll][0] == kernel_size[ll][1] \
                   and in_dim[0] == in_dim[1] \
                   and output_dim[ll][0] == output_dim[ll][1] \
                   and padding[ll][0] == padding[ll][1] \
                   and stride[ll][0] == stride[ll][1]:
                    # Detect fully connected layers
                    if in_dim == [1, 1] and output_dim[ll] == [1, 1]:
                        c_file.write(
                            f' arm_fully_connected_q7({source}, '
                            f'weights_{ll}, {in_chan}, {output_chan[ll]}, 7, 7, '
                            f'bias_{ll}, {buffer1}, '
                            'col_buffer);\n')
                    else:
                        # The 'fast' variant is selected only when the input channel count
                        # is a multiple of 4 and the output channel count is even
                        fn = 'fast' if in_chan % 4 == 0 and output_chan[ll] % 2 == 0 \
                            else 'basic'
                        c_file.write(
                            f' arm_convolve_HWC_q7_{fn}({source}, '
                            f'{in_dim[0]}, '
                            f'{in_chan}, weights_{ll}, {output_chan[ll]}, '
                            f'{kernel_size[ll][0]}, '
                            f'{padding[ll][0]}, '
                            f'{stride[ll][0]}, '
                            f'bias_{ll}, 7, 7, {buffer1}, '
                            f'{output_dim[ll][0]}, '
                            'col_buffer, NULL);\n')
                else:
                    c_file.write(
                        f' arm_convolve_HWC_q7_basic_nonsquare({source}, '
                        f'{in_dim[1]}, {in_dim[0]}, '
                        f'{in_chan}, weights_{ll}, {output_chan[ll]}, '
                        f'{kernel_size[ll][1]}, {kernel_size[ll][0]}, '
                        f'{padding[ll][1]}, {padding[ll][0]}, '
                        f'{stride[ll][1]}, {stride[ll][0]},\n'
                        ' '
                        f'bias_{ll}, 7, 7, {buffer1}, '
                        f'{output_dim[ll][1]}, {output_dim[ll][0]}, '
                        'col_buffer, NULL);\n')

                assert out_size[0] == output_chan[ll] \
                    and out_size[1] == output_dim[ll][0] and out_size[2] == output_dim[ll][1]

                if activation[ll] == op.ACT_RELU:
                    size = output_dim[ll][0] * output_dim[ll][1] * output_chan[
                        ll]
                    if size < 65536:
                        c_file.write(f' arm_relu_q7({buffer1}, {size});\n')
                    else:
                        c_file.write(f' arm_relu32_q7({buffer1}, {size});\n')
                elif activation[
                        ll] is not None:  # FIXME: Support abs() activation
                    eprint("CMSIS-NN generator implements ReLU only.")
                    sys.exit(1)
                # The convolution result is in `buffer1`; make it the next input
                buffer0, buffer1 = buffer1, buffer0

            data_buf.append(out_buf.reshape(out_size))
            c_file.write('\n')
            # NOTE(review): assumed to be computed for every layer (inside the loop);
            # only the value of the last layer is used after the loop -- confirm.
            data_cmsis = data_buf[-1].transpose((1, 2, 0)).flatten()
            if verbose:
                print('TRANSPOSED (HWC) AND FLATTENED:')
                print(data_cmsis)
                print('')

        data = data_buf[-1]

        c_file.write(f' *output = {buffer0};\n'
                     f' *output_size = {data_cmsis.size};\n\n'
                     ' return 1;\n}\n\n')

        if fc_weights:
            data = data.flatten()

            out_buf, out_size = linear_layer(verbose=verbose,
                                             verbose_data=False,
                                             activation=False,
                                             weight=fc_weights[0],
                                             bias=fc_bias[0],
                                             data=data,
                                             debug=debug)

            # Rearrange the weights to account for the shape of the conv layer output
            # (`ll` still refers to the last layer of the loop above)
            w = fc_weights[0]. \
                reshape((fc_weights[0].shape[0],
                         output_chan[ll], output_dim[ll][0], output_dim[ll][1])). \
                transpose(0, 2, 3, 1). \
                reshape((fc_weights[0].shape[0], fc_weights[0].shape[1]))

            # np.dot(worg, torg.flatten()) should be equal to np.dot(wnew, tnew.flatten())
            assert (np.dot(fc_weights[0], data) == np.dot(w, data_cmsis)).all()

            toplevel.fc_layer(c_file,
                              weight_header,
                              w,
                              fc_bias[0],
                              cmsis_nn=True)

        c_file.write(
            'int main(void)\n{\n'
            ' int i;\n'
            ' q7_t *output;\n'
            ' int output_size;\n\n'
            f' cnn_run(input_data, {input_size}, &output, &output_size);\n\n')

        # Expected output of the last layer, compared by the generated code using memcmp()
        toplevel.c_define(sampledata_header, data_cmsis, 'OUTPUT_DATA', '%d',
                          16)
        c_file.write(' if (memcmp(output_data, output, output_size) == 0)\n'
                     ' printf("*** PASS ***\\n\\n");\n'
                     ' else\n'
                     ' printf("!!! FAIL !!!\\n\\n");\n\n')

        if fc_weights:
            c_file.write(' fc_layer(output);\n\n')
            c_file.write(
                ' printf("Classification results:\\n");\n'
                ' for (i = 0; i < NUM_CLASSES; i++) {\n'
                ' printf("[%6d] -> Class %d: %0.1f%%\\n", fc_output[i], i, '
                '(double) (100.0 * ml_softmax[i] / 32768.0));\n'
                ' }\n\n')
        else:
            c_file.write(' printf("Output of final layer:\\n");\n'
                         ' for (i = 0; i < output_size; i++) {\n'
                         ' printf("%5hhd", (int8_t) (output[i] & 0xff));\n'
                         ' if ((i + 1) % 32 == 0)\n printf("\\n");\n'
                         ' else if ((i + 1) % 4 == 0)\n printf(" ");\n'
                         ' }\n'
                         ' printf("\\n");\n'
                         '\n')

        c_file.write(' return 0;\n}\n\n')

    # Close header files
    sampledata_header.close()
    weight_header.close()

    assets.copy('assets', 'cmsis-nn', base_directory, test_name)
target_host = a if o == '-p': target_port = int(a) if o == '-l': listen_port = int(a) listenTuple = ('', listen_port) listenSocket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) listenSocket.setblocking(0) listenSocket.bind(listenTuple) sendTuple = (target_host, target_port) sendSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sendSocket.connect(sendTuple) eprint("Listening on port %d" % (listen_port)) eprint("Sending to %s port %d" % (target_host, target_port)) for msgDict in generateDicts(listenSocket): if 'date' not in msgDict: msgDict['date'] = makeDate('now') else: msgDict['date'] = makeDate(msgDict['date'].rstrip(':')) if 'msg_type' not in msgDict: msgDict['msg_type'] = msgDict['message'].partition(' ')[0].partition( '[')[0] if 'id' in msgDict: msgDict['instance'] = msgDict['id'] + '_' + msgDict['host'] if 'key_fields' in msgDict:
def create_net(
        prefix,
        verbose,
        debug,
        log,
        layers,
        operator,
        auto_input_dim,
        input_dim,
        pooled_dim,
        output_dim,
        kernel_size,
        quantization,  # pylint: disable=unused-argument
        output_shift,
        input_chan,
        output_chan,
        output_width,
        padding,
        dilation,
        stride,
        pool,
        pool_stride,
        pool_average,
        activate,
        data,
        kernel,
        bias,
        fc_weights,
        fc_bias,
        flatten,
        c_filename,
        base_directory,
        log_filename,
        weight_filename,
        sample_filename,
        device=84,
):
    """
    Create the CMSIS NN network.

    Writes three files into `base_directory`/`prefix`: the C source
    (`c_filename` + '.c'), the sample data header (`sample_filename`) and the
    weights header (`weight_filename`). While generating the C code, each layer
    is also evaluated in software (pooling followed by a 1D or 2D convolution)
    so that the expected output of the last layer can be written to the sample
    data header as `OUTPUT_DATA`.

    The per-layer arguments (`operator`, `input_dim`, `kernel_size`, `pool`, ...)
    are indexed by layer number `0 .. layers-1`. `quantization` is accepted but
    not used. When `fc_weights` is non-empty, a software fully connected layer
    using `fc_weights[0]`/`fc_bias[0]` is appended after the last layer.

    The function exits the process with status 1 when any `output_width` is
    not 8. It returns nothing.
    """
    if any(w != 8 for w in output_width):
        eprint('CMSIS network generator does not currently support `output_width` that is not 8.')
        sys.exit(1)

    test_name = prefix
    print(f'{test_name}...')

    out_dir = os.path.join(base_directory, test_name)
    os.makedirs(out_dir, exist_ok=True)

    # Redirect stdout?
    # NOTE(review): the log file is deliberately left open and stdout stays
    # redirected after this function returns, exactly as before.
    if log:
        sys.stdout = open(os.path.join(out_dir, log_filename), 'w')
        print(f'{test_name}')

    filename = c_filename + '.c'

    # All three output files are managed by one `with` so that the headers are
    # closed even when an assertion or exception interrupts code generation.
    with open(os.path.join(out_dir, sample_filename), mode='w') as sampledata_header, \
            open(os.path.join(out_dir, weight_filename), mode='w') as weight_header, \
            open(os.path.join(out_dir, filename), mode='w') as c_file:
        toplevel.copyright_header(c_file)

        c_file.write(f'// {test_name}\n')
        c_file.write(f'// Created using {" ".join(str(x) for x in sys.argv)}\n')

        # Human readable description of test
        c_file.write(f'\n// Configuring {layers} layer{"s" if layers > 1 else ""}:\n')

        for ll in range(layers):
            c_file.write(f'// Layer {ll}: '
                         f'{input_chan[ll]}x{input_dim[ll][0]}x{input_dim[ll][1]}, ')
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                c_file.write(f'{pool[ll][0]}x{pool[ll][1]} {"avg" if pool_average[ll] else "max"} '
                             f'pool with stride {pool_stride[ll]}')
            else:
                c_file.write('no pooling')
            c_file.write(f', {kernel_size[ll][0]}x{kernel_size[ll][1]} convolution '
                         f'with stride {stride[ll]} '
                         f'pad {padding[ll]}, '
                         f'{output_chan[ll]}x{output_dim[ll][0]}x{output_dim[ll][1]} out\n')

        c_file.write('\n')
        toplevel.header(c_file, 0, embedded_code=True, cmsis_nn=True)

        # Pre-define data memory loader.
        d = data.transpose((1, 2, 0)).flatten()  # CHW -> HWC
        toplevel.c_define(sampledata_header, d, 'INPUT_DATA', '%d', 16)
        input_size = d.size
        c_file.write('static const q7_t input_data[] = INPUT_DATA;\n')
        c_file.write('static const q7_t output_data[] = OUTPUT_DATA; // Last conv layer output\n')

        # Pre-define the kernels and bias values
        for ll in range(layers):
            # Rearrange kernels when emulating a fully connected network using 1x1 Conv2D
            # CMSIS data uses HWC, PyTorch uses CHW
            if kernel_size[ll] == [1, 1] and input_dim[ll] == [1, 1]:
                w = kernel[ll]. \
                    reshape((output_chan[ll],
                             input_chan[ll] // (auto_input_dim[ll][0] * auto_input_dim[ll][1]),
                             auto_input_dim[ll][0], auto_input_dim[ll][1],
                             kernel_size[ll][0], kernel_size[ll][1])). \
                    transpose((0, 4, 5, 2, 3, 1)). \
                    flatten()
            elif flatten[ll]:
                w = kernel[ll]. \
                    reshape((output_chan[ll],
                             input_chan[ll],
                             auto_input_dim[ll][0], auto_input_dim[ll][1],
                             kernel_size[ll][0], kernel_size[ll][1])). \
                    transpose((0, 4, 5, 2, 3, 1)). \
                    flatten()
            else:
                w = kernel[ll]. \
                    reshape((output_chan[ll], input_chan[ll],
                             kernel_size[ll][0], kernel_size[ll][1])). \
                    transpose((0, 2, 3, 1)). \
                    flatten()

            toplevel.c_define(weight_header, w, f'WEIGHTS_{ll}', '%d', 16)
            if bias[ll] is not None:
                b = bias[ll].flatten()
            else:
                # We need empty bias values (the Arm code needs them both for rounding of
                # the shifted output, and it does not like NULL bias pointers)
                b = np.zeros(output_chan[ll], dtype=np.int64)
            toplevel.c_define(weight_header, b, f'BIAS_{ll}', '%d', 16)
        c_file.write('\n')

        for ll in range(layers):
            c_file.write(f'static const q7_t weights_{ll}[] = WEIGHTS_{ll};\n')
            c_file.write(f'static const q7_t bias_{ll}[] = BIAS_{ll};\n')
        c_file.write('\n')

        # Compute buffer sizes
        col_buffer_size = 0
        img_buffer_size = 0
        for ll in range(layers):
            col_buffer_size = max(col_buffer_size,
                                  2*input_chan[ll]*kernel_size[ll][0]*kernel_size[ll][1])
            if pool[ll][0] > 1 or pool[ll][1] > 1:
                col_buffer_size = max(col_buffer_size,
                                      pooled_dim[ll][0]*input_chan[ll])  # q15_t doesn't need 2*
            img_buffer_size = max(img_buffer_size,
                                  input_chan[ll]*input_dim[ll][0]*input_dim[ll][1],
                                  output_chan[ll]*output_dim[ll][0]*output_dim[ll][1])

        c_file.write(f'static q7_t buffer0[{max(img_buffer_size, input_size)}];\n')
        c_file.write(f'static q7_t buffer1[{img_buffer_size}];\n')
        c_file.write(f'static q15_t col_buffer[{col_buffer_size}];\n\n')

        c_file.write('int cnn_run(const q7_t *input, int input_size, '
                     'q7_t **output, int *output_size)\n{\n')

        # Compute layer-by-layer output and chain results into input.
        # `buffer0`/`buffer1` hold the *names* of the C buffers; they are swapped
        # after every operation so that `buffer0` always names the latest result.
        buffer0, buffer1 = 'buffer0', 'buffer1'

        for ll in range(layers):
            has_pooling = pool[ll][0] > 1 or pool[ll][1] > 1

            c_file.write(f' // Layer {ll}: [{input_chan[ll]}, {input_dim[ll][0]}, '
                         f'{input_dim[ll][1]}] -> ')
            if has_pooling:
                c_file.write(f'[{input_chan[ll]}, {pooled_dim[ll][0]}, {pooled_dim[ll][1]}] -> ')

            # Add element-wise dimension
            data = np.expand_dims(data, 0)

            in_chan = input_chan[ll]

            # Allow 1D <-> 2D and 2D W/L conversions
            if operator[ll] == op.CONV1D:
                assert input_dim[ll][1] == 1
                data = data.reshape(data.shape[0], data.shape[1], input_dim[ll][0])
            else:
                data = data.reshape(data.shape[0], data.shape[1],
                                    input_dim[ll][0], input_dim[ll][1])

            data, out_size = pooling_layer(
                ll,
                verbose,
                False,
                data[0].shape,
                pool[ll],
                pool_stride[ll],
                pool_average[ll],
                data,
                expand=1,
                expand_thresh=16384,
                operation=operator[ll],
                operands=data.shape[0],
                rounding=False,
                debug=debug,
            )

            if operator[ll] == op.CONV1D:
                assert out_size[0] == in_chan \
                    and out_size[1] == pooled_dim[ll][0] \
                    and pooled_dim[ll][1] == 1
            else:
                assert out_size[0] == in_chan \
                    and out_size[1] == pooled_dim[ll][0] \
                    and out_size[2] == pooled_dim[ll][1]

            # Get rid of element-wise dimension
            data = np.squeeze(data, axis=0)

            if operator[ll] == op.CONV2D:
                if flatten[ll]:
                    in_chan *= input_dim[ll][0] * input_dim[ll][1]
                    data = data.reshape(in_chan, 1, 1)

                out_buf, out_size = conv2d_layer(
                    ll,
                    verbose,
                    False,
                    data.shape,
                    kernel_size[ll],
                    output_shift[ll],
                    output_chan[ll],
                    padding[ll],
                    dilation[ll],
                    stride[ll],
                    activate[ll],
                    kernel[ll].reshape(
                        output_chan[ll],
                        in_chan,
                        kernel_size[ll][0],
                        kernel_size[ll][1],
                    ),
                    bias[ll],
                    data,
                    device=device,
                    debug=debug,
                )
            else:
                out_buf, out_size = conv1d_layer(
                    ll,
                    verbose,
                    False,
                    data.shape,
                    kernel_size[ll][0],
                    output_shift[ll],
                    output_chan[ll],
                    padding[ll][0],
                    dilation[ll][0],
                    stride[ll][0],
                    activate[ll],
                    kernel[ll].reshape(
                        output_chan[ll],
                        in_chan,
                        kernel_size[ll][0]
                    ),
                    bias[ll],
                    data,
                    device=device,
                    debug=debug,
                )

            c_file.write(f'{out_size}\n')

            source = 'input_data' if ll == 0 else buffer0

            if has_pooling:
                if ll == 0:
                    c_file.write(' memcpy(buffer0, input, input_size);'
                                 ' // Pooling may destroy input\n')
                pool_type = 'ave' if pool_average[ll] else 'max'
                if pool[ll][0] != pool[ll][1]:
                    c_file.write(f' arm_{pool_type}pool_nonsquare_q7_HWC_nonsquare({buffer0}, '
                                 f'{input_dim[ll][1]}, {input_dim[ll][0]}, '
                                 f'{input_chan[ll]}, {pool[ll][1]}, {pool[ll][0]}, 0, 0, '
                                 f'{pool_stride[ll][1]}, {pool_stride[ll][0]}, '
                                 f'{pooled_dim[ll][1]}, {pooled_dim[ll][0]}, '
                                 f'(q7_t *) col_buffer, {buffer1});\n')
                else:
                    if input_dim[ll][0] == input_dim[ll][1]:
                        c_file.write(f' arm_{pool_type}pool_q7_HWC({buffer0}, '
                                     f'{input_dim[ll][0]}, {input_chan[ll]}, '
                                     f'{pool[ll][0]}, 0, {pool_stride[ll][0]}, '
                                     f'{pooled_dim[ll][0]}, (q7_t *) col_buffer, {buffer1});\n')
                    else:
                        c_file.write(f' arm_{pool_type}pool_q7_HWC_nonsquare({buffer0}, '
                                     f'{input_dim[ll][1]}, {input_dim[ll][0]}, '
                                     f'{input_chan[ll]}, {pool[ll][0]}, 0, {pool_stride[ll][0]}, '
                                     f'{pooled_dim[ll][1]}, {pooled_dim[ll][0]}, '
                                     f'(q7_t *) col_buffer, {buffer1});\n')
                # The pooled data now lives in the other buffer
                source = buffer1
                buffer0, buffer1 = buffer1, buffer0

            # Check for squareness
            if kernel_size[ll][0] == kernel_size[ll][1] \
               and pooled_dim[ll][0] == pooled_dim[ll][1] \
               and output_dim[ll][0] == output_dim[ll][1] \
               and padding[ll][0] == padding[ll][1] \
               and stride[ll][0] == stride[ll][1]:
                fn = 'fast' if input_chan[ll] % 4 == 0 and output_chan[ll] % 2 == 0 else 'basic'
                c_file.write(f' arm_convolve_HWC_q7_{fn}({source}, '
                             f'{pooled_dim[ll][0]}, '
                             f'{input_chan[ll]}, weights_{ll}, {output_chan[ll]}, '
                             f'{kernel_size[ll][0]}, '
                             f'{padding[ll][0]}, '
                             f'{stride[ll][0]}, '
                             f'bias_{ll}, 0, 7, {buffer1}, '
                             f'{output_dim[ll][0]}, '
                             'col_buffer, NULL);\n')
            else:
                c_file.write(f' arm_convolve_HWC_q7_basic_nonsquare({source}, '
                             f'{pooled_dim[ll][1]}, {pooled_dim[ll][0]}, '
                             f'{input_chan[ll]}, weights_{ll}, {output_chan[ll]}, '
                             f'{kernel_size[ll][1]}, {kernel_size[ll][0]}, '
                             f'{padding[ll][1]}, {padding[ll][0]}, '
                             f'{stride[ll][1]}, {stride[ll][0]},\n'
                             ' '
                             f'bias_{ll}, 0, 7, {buffer1}, '
                             f'{output_dim[ll][1]}, {output_dim[ll][0]}, '
                             'col_buffer, NULL);\n')
            assert out_size[0] == output_chan[ll] \
                and out_size[1] == output_dim[ll][0] and out_size[2] == output_dim[ll][1]

            if activate[ll]:
                size = output_dim[ll][0] * output_dim[ll][1] * output_chan[ll]
                if size < 65536:
                    c_file.write(f' arm_relu_q7({buffer1}, {size});\n')
                else:
                    c_file.write(f' arm_relu32_q7({buffer1}, {size});\n')
            buffer0, buffer1 = buffer1, buffer0

            data = out_buf.reshape(out_size)
            c_file.write('\n')

        data_cmsis = data.transpose((1, 2, 0)).flatten()
        if verbose:
            print('TRANSPOSED (HWC) AND FLATTENED:')
            print(data_cmsis)
            print('')

        c_file.write(f' *output = {buffer0};\n'
                     f' *output_size = {data_cmsis.size};\n\n'
                     ' return 1;\n}\n\n')

        if fc_weights:
            data = data.flatten()

            # The return values are not used below; the call is kept as in the
            # original code.
            out_buf, out_size = linear_layer(
                verbose=verbose,
                verbose_data=False,
                activation=False,
                weight=fc_weights[0],
                bias=fc_bias[0],
                data=data,
                debug=debug
            )

            # Rearrange the weights to account for the shape of the conv layer output.
            # `ll` still holds the index of the last layer from the loop above.
            w = fc_weights[0]. \
                reshape((fc_weights[0].shape[0],
                         output_chan[ll], output_dim[ll][0], output_dim[ll][1])). \
                transpose(0, 2, 3, 1). \
                reshape((fc_weights[0].shape[0], fc_weights[0].shape[1]))

            # np.dot(worg, torg.flatten()) should be equal to np.dot(wnew, tnew.flatten())
            assert (np.dot(fc_weights[0], data) == np.dot(w, data_cmsis)).all()

            toplevel.fc_layer(c_file, weight_header, w, fc_bias[0], cmsis_nn=True)

        c_file.write('int main(void)\n{\n'
                     ' int i;\n'
                     ' q7_t *output;\n'
                     ' int output_size;\n\n'
                     f' cnn_run(input_data, {input_size}, &output, &output_size);\n\n')

        toplevel.c_define(sampledata_header, data_cmsis, 'OUTPUT_DATA', '%d', 16)
        c_file.write(' if (memcmp(output_data, output, output_size) == 0)\n'
                     ' printf("*** PASS ***\\n\\n");\n'
                     ' else\n'
                     ' printf("!!! FAIL !!!\\n\\n");\n\n')

        if fc_weights:
            c_file.write(' fc_layer(output);\n\n')
            c_file.write(' printf("Classification results:\\n");\n'
                         ' for (i = 0; i < NUM_CLASSES; i++) {\n'
                         ' printf("[%6d] -> Class %d: %0.1f%%\\n", fc_output[i], i, '
                         '(double) (100.0 * ml_softmax[i] / 32768.0));\n'
                         ' }\n\n')
        else:
            c_file.write(' printf("Output of final layer:\\n");\n'
                         ' for (i = 0; i < output_size; i++) {\n'
                         ' printf("%5hhd", (int8_t) (output[i] & 0xff));\n'
                         ' if ((i + 1) % 32 == 0)\n printf("\\n");\n'
                         ' else if ((i + 1) % 4 == 0)\n printf(" ");\n'
                         ' }\n'
                         ' printf("\\n");\n'
                         '\n')

        c_file.write(' return 0;\n}\n\n')
def parse(config_file, max_conv=None, device=84):  # pylint: disable=unused-argument
    """
    Configure network parameters from the YAML configuration file `config_file`.

    `max_conv` can be set to force an early termination of the parser: it is
    decremented once per parsed layer entry, and parsing stops after the layer
    at which it has reached zero (i.e., after `max_conv + 1` layer entries).
    `device` is `84`, `85`, etc. and selects the allowed kernel sizes and
    strides.

    The function returns a tuple `(cfg, layers, settings)`: the YAML
    dictionary, the length of the processor map (the number of configured
    layers), and a settings dictionary that maps each setting name to a list
    with one entry per configured layer.

    Any configuration error is reported via `eprint` and terminates the
    process with exit status 1.
    """

    def error_exit(message, sequence):
        """
        Print error message `message` for layer sequence `sequence` and exit.
        """
        eprint(
            f'{message} (found in layer sequence {sequence} in YAML configuration).'
        )
        sys.exit(1)

    # Element-wise operation names accepted both for the operator key and for
    # the `eltwise` key.
    eltwise_ops = {
        'add': op.ELTWISE_ADD,
        'or': op.ELTWISE_OR,
        'sub': op.ELTWISE_SUB,
        'xor': op.ELTWISE_XOR,
    }

    # Load configuration file
    # NOTE(review): `yaml.load` with a custom loader is only safe for trusted
    # input if `UniqueKeyLoader` derives from a safe loader -- confirm.
    with open(config_file) as cfg_file:
        print(f'Reading {config_file} to configure network...')
        cfg = yaml.load(cfg_file, Loader=UniqueKeyLoader)

    if set(cfg) - {'bias', 'dataset', 'layers', 'output_map', 'arch', 'weights'}:
        eprint(f'Configuration file {config_file} contains unknown key(s).')
        sys.exit(1)

    if 'layers' not in cfg or 'arch' not in cfg or 'dataset' not in cfg:
        eprint(f'Configuration file {config_file} does not contain '
               f'`layers`, `arch`, or `dataset`.')
        sys.exit(1)

    # These are initialized with 'None'. Use this to see whether a layer was configured,
    # will be auto-initialized to previous layer's value or a default.
    processor_map = [None] * tc.dev.MAX_LAYERS
    output_map = [None] * tc.dev.MAX_LAYERS
    input_offset = [None] * tc.dev.MAX_LAYERS
    input_chan = [None] * tc.dev.MAX_LAYERS
    input_dim = [None] * tc.dev.MAX_LAYERS
    output_chan = [None] * tc.dev.MAX_LAYERS
    # All other variables are initialized with the default values.
    # The `[[1, 1]] * n` lists share ONE inner list between all entries, so
    # entries must always be replaced, never modified in place.
    padding = [[1, 1]] * tc.dev.MAX_LAYERS
    pool = [[1, 1]] * tc.dev.MAX_LAYERS
    pooling_enabled = [False] * tc.dev.MAX_LAYERS
    average = [0] * tc.dev.MAX_LAYERS
    pool_stride = [[1, 1]] * tc.dev.MAX_LAYERS
    quantization = [None] * tc.dev.MAX_LAYERS
    bias_quantization = [8] * tc.dev.MAX_LAYERS
    output_shift = [None] * tc.dev.MAX_LAYERS
    output_offset = [0] * tc.dev.MAX_LAYERS
    activation = [None] * tc.dev.MAX_LAYERS
    big_data = [False] * tc.dev.MAX_LAYERS
    output_width = [8] * tc.dev.MAX_LAYERS
    operator = [op.CONV2D] * tc.dev.MAX_LAYERS
    # We don't support changing the following (yet), but leave as parameters:
    dilation = [[1, 1]] * tc.dev.MAX_LAYERS
    kernel_size = [DEFAULT_2D_KERNEL] * tc.dev.MAX_LAYERS
    conv_groups = [1] * tc.dev.MAX_LAYERS
    stride = [[1, 1]] * tc.dev.MAX_LAYERS
    streaming = [False] * tc.dev.MAX_LAYERS
    flatten = [False] * tc.dev.MAX_LAYERS
    operands = [1] * tc.dev.MAX_LAYERS
    eltwise = [op.NONE] * tc.dev.MAX_LAYERS
    pool_first = [True] * tc.dev.MAX_LAYERS
    in_sequences = [None] * tc.dev.MAX_LAYERS
    write_gap = [0] * tc.dev.MAX_LAYERS

    allowed_layer_keys = {
        'max_pool', 'avg_pool', 'convolution', 'conv_groups', 'in_channels',
        'in_dim', 'in_sequences', 'in_offset', 'kernel_size', 'pool_stride',
        'out_channels', 'out_offset', 'activate', 'activation', 'data_format',
        'eltwise', 'flatten', 'op', 'operands', 'operation', 'operator',
        'output_processors', 'output_width', 'output_shift', 'pool_first',
        'processors', 'pad', 'quantization', 'sequence', 'streaming', 'stride',
        'write_gap',
    }

    sequence = 0
    for ll in cfg['layers']:
        if set(ll) - allowed_layer_keys:
            eprint(
                f'Configuration file {config_file} contains unknown key(s) for `layers`.'
            )
            sys.exit(1)

        if 'sequence' in ll:
            sequence = ll['sequence']  # Override sequence information

        if processor_map[sequence]:
            error_exit('Layer was already specified', sequence)
        if 'processors' in ll:
            processor_map[sequence] = ll['processors']
        if not processor_map[sequence]:
            error_exit('`processors` must not be zero or missing', sequence)
        if not isinstance(processor_map[sequence], int) \
           or processor_map[sequence] >= 2**tc.dev.MAX_PROC:
            error_exit(
                f'`processors` must be an int from 0 to 2**{tc.dev.MAX_PROC}-1',
                sequence)

        if 'output_processors' in ll:
            output_map[sequence] = ll['output_processors']
            if not output_map[sequence]:
                error_exit('`output_processors` cannot be zero', sequence)
            if not isinstance(output_map[sequence], int) \
               or output_map[sequence] >= 2**tc.dev.MAX_PROC:
                error_exit(
                    '`output_processors` must be an int from 0 to '
                    f'2**{tc.dev.MAX_PROC}-1', sequence)

        # `max_pool` takes precedence over `avg_pool` when both are present
        if 'max_pool' in ll:
            val = ll['max_pool']
            if not isinstance(val, list):
                pool[sequence] = [val, val]
            else:
                pool[sequence] = val
            pooling_enabled[sequence] = True
        elif 'avg_pool' in ll:
            val = ll['avg_pool']
            if not isinstance(val, list):
                pool[sequence] = [val, val]
            else:
                pool[sequence] = val
            pooling_enabled[sequence] = True
            average[sequence] = 1

        if 'pool_stride' in ll:
            val = ll['pool_stride']
            if not isinstance(val, list):
                pool_stride[sequence] = [val, val]
            else:
                pool_stride[sequence] = val

        if 'quantization' in ll:
            val = ll['quantization']
            if val not in [1, 2, 4, 8]:
                error_exit('`quantization` must be 1, 2, 4, or 8', sequence)
            quantization[sequence] = val

        if 'output_shift' in ll:
            val = ll['output_shift']
            output_shift[sequence] = val
            # The implicit shift for quantization is added later

        if 'in_channels' in ll:
            input_chan[sequence] = ll['in_channels']
        if 'in_dim' in ll:
            if isinstance(ll['in_dim'], list) and len(ll['in_dim']) > 2:
                error_exit('`in_dim` must not exceed two dimensions', sequence)
            input_dim[sequence] = ll['in_dim']
        if 'in_offset' in ll:
            input_offset[sequence] = ll['in_offset']
        if 'out_channels' in ll:
            output_chan[sequence] = ll['out_channels']
        if 'out_offset' in ll:
            output_offset[sequence] = ll['out_offset']
        else:
            print('WARNING: Defaulting to `out_offset = 0` for '
                  f'layer sequence {sequence} in YAML configuration.')

        if 'activate' in ll or 'activation' in ll:
            key = 'activate' if 'activate' in ll else 'activation'
            if ll[key].lower() == 'relu':
                activation[sequence] = op.ACT_RELU
            elif ll[key].lower() == 'abs':
                activation[sequence] = op.ACT_ABS
            elif ll[key].lower() == 'none':
                activation[sequence] = None
            else:
                error_exit(f'Unknown value "{ll[key]}" for `{key}`', sequence)

        if 'convolution' in ll or 'operation' in ll or 'op' in ll or 'operator' in ll:
            key = 'convolution' if 'convolution' in ll else \
                  'operation' if 'operation' in ll else \
                  'operator' if 'operator' in ll else \
                  'op'
            conv = ll[key].lower()
            if conv == 'conv1d':
                operator[sequence] = op.CONV1D
            elif conv == 'conv2d':
                operator[sequence] = op.CONV2D
            elif conv == 'convtranspose2d':
                operator[sequence] = op.CONVTRANSPOSE2D
            elif conv in ['none', 'passthrough']:
                operator[sequence] = op.NONE
                padding[sequence] = [0, 0]
            elif conv in eltwise_ops:
                # Element-wise operation without convolution
                operator[sequence] = op.NONE
                eltwise[sequence] = eltwise_ops[conv]
                operands[sequence] = 2
                padding[sequence] = [0, 0]
            elif conv in ['linear', 'fc', 'mlp']:
                # Emulate using Conv2D with 1x1 kernels and 1x1 data
                operator[sequence] = op.CONV2D
                kernel_size[sequence] = FC_KERNEL
                padding[sequence] = [0, 0]
            else:
                error_exit(f'Unknown value "{ll[key]}" for `{key}`', sequence)
        else:
            print('WARNING: Defaulting to `op: Conv2d` for '
                  f'layer sequence {sequence} in YAML configuration.')

        if 'pad' in ll:
            val = ll['pad']
            if val < 0:
                error_exit(f'Unsupported value {val} for `pad`', sequence)
            padding[sequence] = [val, val]

        if 'eltwise' in ll:
            conv = ll['eltwise'].lower()
            if conv not in eltwise_ops:
                error_exit(f'Unknown value "{ll["eltwise"]}" for `eltwise`', sequence)
            eltwise[sequence] = eltwise_ops[conv]
            operands[sequence] = 2

        if 'pool_first' in ll:
            val = ll['pool_first']
            try:
                pool_first[sequence] = bool(val)
            except ValueError:
                error_exit(f'Unsupported value `{val}` for `pool_first`', sequence)

        if 'operands' in ll:
            if not op.eltwise(eltwise[sequence]):
                error_exit(
                    '`operands` can only be used with element-wise operations',
                    sequence)
            val = ll['operands']
            if val < 2 or val > 16:
                error_exit('`operands` has to be 2..16', sequence)
            operands[sequence] = val

        if 'data_format' in ll:
            if sequence:
                error_exit(
                    '`data_format` can only be configured for the first layer',
                    sequence)

            val = ll['data_format'].lower()
            if val in ['chw', 'big']:
                big_data[sequence] = True
            elif val in ['hwc', 'little']:
                pass
            else:
                error_exit('Unknown value for `data_format`', sequence)

        if 'output_width' in ll:
            val = ll['output_width']
            if val not in [8, 32]:
                error_exit('`output_width` must be 8 or 32', sequence)
            output_width[sequence] = val

        if 'kernel_size' in ll:
            # A kernel size other than the default has already been fixed by
            # the operator handling above (fully connected emulation)
            if kernel_size[sequence] != DEFAULT_2D_KERNEL:
                error_exit(
                    'Cannot configure `kernel_size` for fully connected layers',
                    sequence)

            val = str(ll['kernel_size']).lower()
            if operator[sequence] == op.CONV2D:
                if device == 84 and val not in ['3x3'] \
                   or device != 84 and val not in ['1x1', '3x3']:
                    error_exit(f'Unsupported value `{val}` for `kernel_size`',
                               sequence)
                kernel_size[sequence] = [int(val[0]), int(val[2])]
            elif operator[sequence] == op.CONVTRANSPOSE2D:
                if val not in ['3x3']:
                    error_exit(f'Unsupported value `{val}` for `kernel_size`',
                               sequence)
                kernel_size[sequence] = [int(val[0]), int(val[2])]
            else:
                try:
                    val = int(val)
                except ValueError:
                    error_exit(f'Unsupported value `{val}` for `kernel_size`',
                               sequence)
                if device == 84 and val != 9 or val < 1 or val > 9:
                    error_exit(f'Unsupported value `{val}` for `kernel_size`',
                               sequence)
                kernel_size[sequence] = [val, 1]
        elif operator[sequence] == op.CONV1D:  # Set default for 1D convolution
            kernel_size[sequence] = DEFAULT_1D_KERNEL

        if 'stride' in ll:
            val = ll['stride']
            if pooling_enabled[sequence]:
                # Must use the default stride when pooling, otherwise stride can be set
                # NOTE(review): as written, this condition is true for every
                # `val != 1`, and for every value when `device == 84` -- the
                # second operand looks like it lost an operator test; confirm
                # the intended rule before changing it.
                if operator[sequence] == op.CONV2D and val != 1 \
                   or (device == 84 and val != 3 or val != 1):
                    error_exit(
                        'Cannot set `stride` to non-default value when pooling',
                        sequence)
            else:
                if operator[sequence] == op.CONVTRANSPOSE2D and val != 2:
                    error_exit(
                        'Cannot set `stride` to non-default value for ConvTranspose2D',
                        sequence)
                # Stride can be set
                stride[sequence] = [val, val]

        if 'streaming' in ll:
            val = ll['streaming']
            try:
                streaming[sequence] = bool(val)
            except ValueError:
                error_exit(f'Unsupported value `{val}` for `streaming`', sequence)

        if 'flatten' in ll:
            val = ll['flatten']
            try:
                flatten[sequence] = bool(val)
            except ValueError:
                error_exit(f'Unsupported value `{val}` for `flatten`', sequence)

        if 'in_sequences' in ll:
            if isinstance(ll['in_sequences'], list):
                if any(i >= sequence for i in ll['in_sequences']):
                    error_exit(
                        '`in_sequences` cannot be greater than layer sequence',
                        sequence)
            elif ll['in_sequences'] >= sequence:
                error_exit(
                    '`in_sequences` cannot be greater than layer sequence',
                    sequence)
            in_sequences[sequence] = ll['in_sequences']

        if 'conv_groups' in ll:
            conv_groups[sequence] = ll['conv_groups']

        if 'write_gap' in ll:
            write_gap[sequence] = ll['write_gap']

        # Fix up values for 1D convolution or no convolution.
        # Fresh lists are assigned on purpose: the current entry may be the
        # default list shared by all layers, or a list owned by the YAML
        # dictionary, and modifying it in place would leak into other layers.
        if operator[sequence] == op.CONV1D:
            padding[sequence] = [padding[sequence][0], 0]
            pool[sequence] = [pool[sequence][0], 1]
            pool_stride[sequence] = [pool_stride[sequence][0], 1]
            stride[sequence] = [stride[sequence][0], 1]
        elif operator[sequence] == op.NONE:
            kernel_size[sequence] = [1, 1]
        elif operator[sequence] == op.CONVTRANSPOSE2D:
            stride[sequence] = [2, 2]

        # Check for early exit
        if max_conv is not None:
            if max_conv == 0:
                # Default the output map of the final parsed layer from the
                # `processors` of the layer entry that would have come next
                if output_map[sequence] is None and (len(cfg['layers']) > sequence + 1):
                    if 'processors' in cfg['layers'][sequence + 1]:
                        output_map[sequence] = cfg['layers'][sequence + 1]['processors']
                break
            max_conv -= 1

        sequence += 1

    # Sequence specification may have holes. Contract to the used layers.
    # Every per-layer list has to be contracted, or the lists would no longer
    # be indexed consistently.
    per_layer_lists = [
        processor_map, padding, pool, pool_stride, input_chan, input_dim,
        input_offset, output_chan, output_offset, average, activation, big_data,
        quantization, bias_quantization, output_shift, output_map, output_width,
        operator, dilation, kernel_size, stride, pooling_enabled, streaming,
        flatten, operands, eltwise, pool_first, in_sequences, conv_groups,
        write_gap,
    ]
    for ll in range(tc.dev.MAX_LAYERS - 1, -1, -1):
        if processor_map[ll] is None:
            for per_layer in per_layer_lists:
                del per_layer[ll]

    # Check all but last layer
    for ll in range(len(output_map) - 1):
        if output_width[ll] != 8:
            error_exit('`output_width` is not 8 for intermediate layer', ll)
        # Fix up default output maps
        if output_map[ll] is None:
            output_map[ll] = processor_map[ll + 1]

    # Check all but first layer
    for ll in range(1, len(input_offset)):
        # Fix up default input maps
        if input_offset[ll] is None:
            input_offset[ll] = output_offset[ll - 1]
        # Check we don't turn on streaming too late
        if streaming[ll] and not streaming[ll - 1]:
            error_exit('Enable streaming from the first layer on', ll)

    # Check first layer
    if input_offset[0] is None:
        input_offset[0] = 0
    # Check last layer
    if output_map[-1] is None and 'output_map' in cfg:
        output_map[-1] = cfg['output_map']
    if output_width[-1] != 8 and activation[-1] is not None:
        error_exit('`output_width` must be 8 when activation is used', len(activation))

    # Check all layers
    for ll, e in enumerate(operator):
        # Check that pass-through does not use activation
        if e == op.NONE:
            if activation[ll] is not None:
                error_exit('Pass-through layers must not use activation', ll)
            if padding[ll][0] != 0 or padding[ll][1] != 0:
                error_exit('Padding must be zero for passthrough layers', ll)
        # Check that pooling isn't set for ConvTranspose2d:
        elif e == op.CONVTRANSPOSE2D:
            if pooling_enabled[ll]:
                error_exit('ConvTranspose2d cannot be used with pooling', ll)
        # Check that element-wise does not use Conv1d
        if e == op.CONV1D and operands[ll] > 1:
            error_exit(
                'Element-wise operations cannot be combined with Conv1d', ll)
        if not pool_first[ll] and (operands[ll] == 1
                                   or pool[ll][0] == 1 and pool[ll][1] == 1):
            error_exit(
                '`pool_first: False` requires both pooling and element-wise operations',
                ll)

    if device == 84:
        # Fix up defaults for Conv1D:
        for ll, e in enumerate(operator):
            if e == op.CONV1D:
                kernel_size[ll] = [9, 1]

    settings = {
        'padding': padding,
        'pool': pool,
        'pooling_enabled': pooling_enabled,
        'pool_stride': pool_stride,
        'input_chan': input_chan,
        'input_dim': input_dim,
        'input_offset': input_offset,
        'output_chan': output_chan,
        'output_offset': output_offset,
        'processor_map': processor_map,
        'average': average,
        'activation': activation,
        'big_data': big_data,
        'quantization': quantization,
        'bias_quantization': bias_quantization,
        'output_shift': output_shift,
        'output_processor_map': output_map,
        'output_width': output_width,
        'operator': operator,
        'dilation': dilation,
        'kernel_size': kernel_size,
        'stride': stride,
        'streaming': streaming,
        'flatten': flatten,
        'operands': operands,
        'eltwise': eltwise,
        'pool_first': pool_first,
        'in_sequences': in_sequences,
        'conv_groups': conv_groups,
        'write_gap': write_gap,
    }

    return cfg, len(processor_map), settings
def load(
        checkpoint_file,
        unused_arch,
        fc_layer,
        quantization,
        bias_quantization,
        output_shift,
        kernel_size,  # this information available in onnx model
        operator,
        verbose=False,
        no_bias=None,
):
    """
    Load weights and biases from the ONNX model `checkpoint_file`.

    Only `Conv` and `Gemm` nodes of the model graph are examined; each such
    node counts as one layer. `unused_arch` is not used by this function.

    If `fc_layer` is `True`, configure a single fully connected classification layer for
    software rather than hardware (the weights/biases of `Gemm` nodes are then also
    collected in `fc_weights`/`fc_bias`).

    `quantization` is a list of expected bit widths for the layer weights (always 8 for AI84).
    This value is checked against the weight inputs; entries that are `None` are determined
    from the weight range and written back into the list.
    `bias_quantization` is a list of the expected bit widths for the layer biases (always
    8 for AI84/AI85).
    `output_shift` is a per-layer list that is modified in place: `None` entries are
    replaced by 0, and `8 - quantization` is added for each layer.
    `kernel_size` is only used in the error messages for kernel shape mismatches;
    the check itself compares the ONNX `kernel_shape` with the weight shape.
    `operator` is a per-layer list; weights of `opn.CONVTRANSPOSE2D` layers are flipped.
    `no_bias` is an optional list of layer numbers whose bias input is ignored.

    In addition to returning weights and biases, this function configures the network output
    channels and the number of layers.

    When `verbose` is set, display the shapes of the weights.

    Returns the tuple `(layers, weights, bias, output_shift, fc_weights, fc_bias,
    input_channels, output_channels)`. Exits with status 1 after the summary when a
    kernel shape mismatch was found.
    """
    model = onnx.load(checkpoint_file)
    print(f'Reading {checkpoint_file} to configure network weights...')

    layers = 0
    num_conv_layers = len(quantization)
    no_bias = no_bias or []
    weights = []
    bias = []
    fc_weights = []
    fc_bias = []
    weight_keys = []
    bias_keys = []
    output_channels = []
    input_channels = []
    param_count = 0
    param_size = 0
    # Set when a kernel shape mismatch is found; the exit is deferred until
    # the summary below has been printed
    error_exit = False
    quant = []
    bias_quant = []
    weight_min = []
    weight_max = []
    weight_size = []
    bias_min = []
    bias_max = []
    bias_size = []
    seq = 0

    kernel_size_onnx = []

    # Names of all initializer tensors in the graph
    initializers = {t.name for t in model.graph.initializer}
    for _, node in enumerate(model.graph.node):
        if node.op_type == 'Conv' or node.op_type == 'Gemm':
            _inputs, _outputs = get_inouts(node)
            for _input in _inputs:
                # Inputs for which no data is returned are skipped
                w = process_channels(model, _input, initializers)
                if w is not None:
                    if node.op_type == 'Gemm':  # general matrix multiplication (FC layer)
                        kernel_shape = [1, 1]
                        kernel_size_onnx.append(kernel_shape)
                        # Skip the remaining processing for this input once
                        # all configured layers have been seen
                        if layers >= num_conv_layers:
                            continue
                        if fc_layer:
                            if _input == _inputs[1]:  # weight
                                assert w.min() >= -128 and w.max() <= 127
                                fc_weights.append(w)

                            if len(_inputs) == 3:  # have optional bias input
                                if _input == _inputs[2]:  # bias
                                    assert w.min() >= -128 and w.max() <= 127
                                    fc_bias.append(w)
                            elif _input == _inputs[1]:  # add bias 'None'
                                fc_bias.append(
                                    None)  # during weight input processing

                    if node.op_type == 'Conv':  # (Conv layer)
                        for a in node.attribute:
                            if a.name == 'kernel_shape':
                                kernel_size_onnx.append(a.ints)

                    if len(w.shape) > 1:  # not a bias
                        # NOTE(review): this records the value before the
                        # auto-detection below, so it is `None` when the
                        # quantization was not given -- confirm this is intended.
                        quant.append(quantization[seq])

                        w_min, w_max = w.min(), w.max()

                        # Determine quantization or make sure that what was given fits
                        if quantization[seq] is not None:
                            assert w_min >= -(2**(quantization[seq] - 1)), print(w_min)
                            assert w_max < 2**(quantization[seq] - 1), print(w_max)
                        else:
                            if w_max > 0:
                                w_max_m = int(w_max)
                            else:
                                w_max_m = int(abs(w_max)) - 1
                            if w_min > 0:
                                w_min_m = int(w_min)
                            else:
                                w_min_m = int(abs(w_min)) - 1
                            quantization[seq] = 1 << (
                                fls(max(fls(w_max_m), fls(w_min_m)) + 1) + 1)
                            assert quantization[seq] <= 8

                        weight_min.append(w_min)
                        weight_max.append(w_max)

                        # Not overriding output_shift?
                        if output_shift[seq] is None:
                            output_shift[seq] = 0
                        # Add based on quantization
                        output_shift[seq] += 8 - quantization[seq]

                        # TODO: Double check if we need to check conv2d if opn is known
                        # to be opn.CONVTRANSPOSE2D. We should be able to get this
                        # from the op_type Conv plus shape?
                        if operator[seq] == opn.CONVTRANSPOSE2D:
                            # For ConvTranspose2d, flip the weights as follows:
                            w = np.flip(w, axis=(2, 3)).swapaxes(0, 1)

                        input_channels.append(w.shape[1])  # Input channels
                        output_channels.append(w.shape[0])  # Output channels

                        # Compare the ONNX kernel shape with the weight shape.
                        # Note that the messages print the `kernel_size`
                        # argument, not the ONNX value that was compared.
                        if len(w.shape) == 2:  # MLP
                            if kernel_size_onnx[seq][
                                    0] != 1 or kernel_size_onnx[seq][1] != 1:
                                eprint(
                                    f'The `kernel_size` for the MLP layer {seq} should '
                                    f'be set to 1x1 instead of '
                                    f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.'
                                )
                                error_exit = True
                        elif len(w.shape) == 3:  # 1D
                            if kernel_size_onnx[seq][0] != w.shape[2] \
                               or kernel_size_onnx[seq][1] != 1:
                                eprint(
                                    f'The `kernel_size` for the 1D layer {seq} should '
                                    f'be set to {w.shape[2]}x1 instead of '
                                    f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.'
                                )
                                error_exit = True
                        elif len(w.shape) == 4:  # 2D
                            if kernel_size_onnx[seq][0] != w.shape[2] \
                               or kernel_size_onnx[seq][1] != w.shape[3]:
                                eprint(
                                    f'The `kernel_size` for the 2D layer {seq} should '
                                    f'be set to {w.shape[2]}x{w.shape[3]} instead of '
                                    f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.'
                                )
                                error_exit = True

                        # Weight storage in bytes, rounded up
                        w_count = np.prod(w.shape)
                        param_count += w_count
                        w_size = (w_count * quantization[seq] + 7) // 8
                        weight_size.append(w_size)
                        param_size += w_size

                        if len(w.shape) == 2:  # linear - add dummy 'channel'
                            w = np.expand_dims(w, axis=0)
                        else:  # conv1d, conv2d, ... - combine input and output channels
                            w = np.reshape(w, (-1, ) + w.shape[2:])

                        weights.append(w)
                        weight_keys.append(_input)

                    if len(_inputs) < 3 or \
                       (_input == _inputs[2] and seq in no_bias):  # no bias input
                        bias.append(None)
                        bias_min.append(0)
                        bias_max.append(0)
                        bias_keys.append('N/A')
                        bias_quant.append(0)
                        bias_size.append(0)
                    elif _input == _inputs[2]:  # bias input
                        # Scale the bias down by the device's bias divider
                        w = w // tornadocnn.dev.BIAS_DIV
                        w_min, w_max = w.min(), w.max()
                        assert w_min >= -(2**(bias_quantization[seq] - 1))
                        assert w_max < 2**(bias_quantization[seq] - 1)

                        bias_min.append(w_min)
                        bias_max.append(w_max)

                        bias.append(w)
                        bias_keys.append(_input)
                        bias_quant.append(bias_quantization[seq])
                        w_count = np.prod(w.shape)
                        param_count += w_count
                        w_size = (w_count * 8 + (bias_quantization[seq] - 1)) // bias_quantization[seq]
                        bias_size.append(w_size)
                        param_size += w_size

            # One layer per Conv/Gemm node
            seq += 1
            layers += 1
        # TODO: Things to add
        # if attribute.name == 'pads':
        # if attribute.name == 'strides':

    if verbose:
        print(
            'Layer InCh OutCh Weights Quant Min Max Size '
            'Key Bias Quant Min Max Size Key'
        )
        for ll in range(layers):
            if ll < len(weights) and weights[ll] is not None:
                weight_shape = str(weights[ll].shape)
                if bias[ll] is not None:
                    bias_shape = str(bias[ll].shape)
                else:
                    bias_shape = 'N/A'
                print(
                    f'{ll:4}: '
                    f'{input_channels[ll]:5} {output_channels[ll]:5} '
                    f'{weight_shape:15} '
                    f'{quant[ll]:5} {weight_min[ll]:4} {weight_max[ll]:3} {weight_size[ll]:6} '
                    f'{weight_keys[ll]:35} '
                    f'{bias_shape:10} '
                    f'{bias_quant[ll]:5} {bias_min[ll]:4} {bias_max[ll]:3} {bias_size[ll]:4} '
                    f'{bias_keys[ll]:25}')
        print(
            f'TOTAL: {layers} layers, {param_count:,} parameters, {param_size:,} bytes'
        )

    # Deferred exit for kernel shape mismatches reported above
    if error_exit:
        sys.exit(1)

    if verbose:
        with np.printoptions(threshold=np.inf, linewidth=80):
            print("\nSUMMARY\n=======")
            print(layers, "layers\n")
            print("weights:")
            print(weights)
            print("bias:")
            print(bias)
            print("fc_weights:")
            print(fc_weights)
            print("fc_bias:")
            print(fc_bias)
            print("input_channels:")
            print(input_channels)
            print("output_channels:")
            print(output_channels)
            print("")

    return layers, weights, bias, output_shift, \
        fc_weights, fc_bias, input_channels, output_channels
def extract_funct_alt(funct_lines, funct_name, starting_line_num):
    """
    Build a function record from the assembly lines of a single function.

    The lines in `funct_lines` are scanned until one whose first word starts
    with '.LFE' is reached. Along the way a site is recorded for every call,
    every return and every jump that has a register ('%') operand.

    Returns a tuple `(new_funct, line_num)`: the `funct_cfg.Function` that was
    built and the value of the line counter when scanning stopped.
    """
    # NOTE(review): `dwarf_loc` and `asm_file` are neither parameters nor
    # locals of this function; they must come from an enclosing scope --
    # confirm they are actually defined where this function lives.
    start_line_num = 0  # only appears in the "unterminated function" message

    call_mnemonics = ("call", "callf", "callq")
    return_mnemonics = ("ret", "retf", "iret", "retq", "iretq")
    jump_mnemonics = (
        "jo", "jno", "jb", "jnae", "jc", "jnb", "jae", "jnc", "jz", "je",
        "jnz", "jne", "jbe", "jna", "jnbe", "ja", "js", "jns", "jp", "jpe",
        "jnp", "jpo", "jl", "jnge", "jnl", "jge", "jle", "jng", "jnle", "jg",
        "jecxz", "jrcxz", "jmp", "jmpe",
    )

    CALL_SITE = 0
    RETURN_SITE = 1
    INDIR_JMP_SITE = 2
    PLT_SITE = 3  # defined for completeness; not used in this function

    in_comment = False
    sites = []
    direct_call_sites = []
    # One dict shared by every return site, so that each site's return dict
    # is a reference to the function's return dict.
    shared_ret_dict = dict()
    line_num = starting_line_num

    for asm_line in funct_lines:
        asm_parsing.update_dwarf_loc(asm_line, dwarf_loc)

        words = asm_line.split()
        if not words:
            # Empty line: nothing to decode.
            # NOTE(review): the value read here is overwritten by the next
            # loop iteration; this looks like a leftover from a file-based
            # variant of the parser -- confirm before removing.
            asm_line = asm_file.readline()
            line_num += 1
            continue

        if words[0].startswith('.LFE'):
            break

        targets = []
        decoded = asm_parsing.decode_line(asm_line, in_comment)
        labels, mnemonic, operands, in_comment = decoded

        if mnemonic in call_mnemonics:
            call_site = funct_cfg.Site(line_num, targets, CALL_SITE,
                                       dwarf_loc)
            if '%' not in operands:
                # No register operand: the callee is named directly.
                call_site.targets.append(operands)
                direct_call_sites.append(call_site)
            sites.append(call_site)
        elif mnemonic in return_mnemonics:
            sites.append(
                funct_cfg.Site(line_num, shared_ret_dict, RETURN_SITE,
                               dwarf_loc))
        elif mnemonic in jump_mnemonics and '%' in operands:
            sites.append(
                funct_cfg.Site(line_num, targets, INDIR_JMP_SITE, dwarf_loc))

        line_num += 1
    else:
        # The loop ran out of lines without reaching the '.LFE' marker.
        eprint(
            dwarf_loc.filename() + ':' + ' ' + ':' + str(start_line_num) +
            ' error: unterminated function: ', funct_name)

    src_filename = dwarf_loc.filename()
    new_funct = funct_cfg.Function(funct_name, ' ', src_filename, sites,
                                   starting_line_num)
    new_funct.direct_call_sites = direct_call_sites
    new_funct.ret_dict = shared_ret_dict
    return new_funct, line_num
def filename(self):
    """
    Return the file name registered for the current `filenum`.

    When `filenum` has no entry in `_filename_dict`, a warning is emitted via
    `eprint` and the placeholder '?' is returned instead.
    """
    number = self.filenum
    known_files = self._filename_dict
    if number in known_files:
        return known_files[number]

    eprint('warning: undefined filenumber: ' + str(number))
    return '?'
def generateDicts(sock):
    """
    Generator that reads syslog datagrams from `sock` and yields one dict per
    parsed message.

    Datagrams (up to 8192 bytes each) are appended to a buffer for as long as
    `sock.recvfrom` succeeds. When it raises `socket.error`, a statistics line
    is printed via `eprint` if the wall-clock second has changed, and the
    buffered datagrams are then parsed in arrival order.
    (NOTE(review): this presumably expects a non-blocking socket or one with
    a timeout -- confirm against the caller.)

    Each message is matched field by field ('pri', 'date', 'host', 'message').
    The yielded dict contains 'raw_message', the fields that matched, the
    derived 'severity_int', 'facility_int', 'severity_label' and
    'facility_label' when a priority was found, 'fromhost', 'fromhost-ip',
    a lower-cased 'host' and, when a vendor was selected from the host name,
    'vendor'. Messages the vendor object does not recognise get 'state' = 5.
    Messages that raise `KeyError` during vendor handling are skipped.

    The facility is `pri >> 3` as defined by the syslog PRI encoding. A
    facility outside the known table (malformed PRI above 191) is labelled
    'unknown'.
    """
    # Local import so the block does not depend on the file's import header.
    from collections import deque

    severityMap = {
        "0": "emerg",
        "1": "alert",
        "2": "crit",
        "3": "err",
        "4": "warning",
        "5": "notice",
        "6": "info",
        "7": "debug"
    }
    facilityMap = {
        "0": "kernel",
        "1": "user",
        "2": "mail",
        "3": "system",
        "4": "auth",
        "5": "syslog",
        "6": "lpd",
        "7": "news",
        "8": "uucp",
        "9": "time",
        "10": "auth",
        "11": "ftp",
        "12": "ntp",
        "13": "logaudit",
        "14": "logalert",
        "15": "clock",
        "16": "local0",
        "17": "local1",
        "18": "local2",
        "19": "local3",
        "20": "local4",
        "21": "local5",
        "22": "local6",
        "23": "local7"
    }

    # Host-name prefixes used to pick the vendor parser; the groups are
    # checked in this order, so the first matching group wins.
    juniper_prefixes = ('bar', 'bcr', 'scr', 'sff', 'mfw', 're', 'bmr', 'fw',
                        'r1', 'r2')
    arista_prefixes = ('ma', 'trr', 'spr', 'ssr', 'ser')
    a10_prefixes = ('slb', 'mlb', 'glb', 'vpr')

    skip = 0
    skipcount = 0
    # Kept as an int so the "has the second changed?" comparison below
    # compares like with like.
    ssec = int(datetime.utcnow().strftime("%S"))
    yieldcount = 0

    # Compile regex patterns for iteration on each component of the message
    pats = {}
    pristrings = [r'^<(?P<pri>\d{1,3})>(\d*:?)?']
    pats['pri'] = []
    for i in pristrings:
        pats['pri'].append(re.compile(i + r'(?P<space>\s?)\S+'))

    # Date/time
    datestrings = [
        r'(?P<date>[A-Za-z]+ [ \d]?\d \d\d:\d\d:\d\d( [A-Z]{3}:)?)',
        r'(?P<date>\d{4} [A-Za-z]+ [ \d]?\d \d\d:\d\d:\d\d( [A-Z]{3}:)?)',
        r'(?P<date>\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d\.\d{3}Z)'
    ]
    pats['date'] = []
    for i in datestrings:
        pats['date'].append(re.compile(i + r'(?P<space>\s+)\S+'))

    # Host/IP
    hoststrings = [
        r'(?P<host>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
        r'(?P<host>[a-z0-9_-]+(\.[a-z0-9_-]+)*(\.[a-z]+[0-9]?))',
        r'(?P<host>[a-z0-9_-]+)'
    ]
    pats['host'] = []
    for i in hoststrings:
        pats['host'].append(
            re.compile(i + r'(?P<space>\s+)\S+', re.IGNORECASE))

    # Rest of line
    pats['message'] = [re.compile(r'(?P<message>.*)(?P<space>\s*)$')]

    currentDict = {}
    a10 = Device.A10('logs')
    arista = Device.Arista('logs')
    brocade = Device.Brocade('logs')
    f5 = Device.F5('logs')
    force10 = Device.Force10('logs')
    juniper = Device.Juniper('logs')
    linux = Device.Linux('logs')

    # deque gives O(1) removal from the front of the receive buffer
    pktbuf = deque()
    pktbuf_peak = 0
    packets = 0
    processed = 0

    while True:
        try:
            data, (src_ip, port) = sock.recvfrom(8192)
            pktbuf.append((data, src_ip, port))
            packets += 1
        except socket.error:
            # Nothing (more) to read right now: report statistics once per
            # second, then work through whatever has been buffered.
            now_sec = int(datetime.utcnow().strftime("%S"))
            if now_sec != ssec:
                ssec = now_sec
                eprint(
                    "%28s Messages in buffer (peak): %d read: %d processed: %d yielded: %d skipped: %d"
                    % (str(datetime.utcnow()), pktbuf_peak, packets,
                       processed, yieldcount, skipcount))
                pktbuf_peak = 0

            if len(pktbuf) == 0:
                continue

            buf_len = len(pktbuf)
            if buf_len > pktbuf_peak:
                pktbuf_peak = buf_len

            while pktbuf:
                line, src_ip, port = pktbuf.popleft()
                processed += 1

                # We can safely init the dict here because multiline messages
                # are still contained within single datagrams
                currentDict = {}

                # Pristine copy of what we received
                currentDict['raw_message'] = line

                # Strip any leading junk. Slicing (rather than indexing) keeps
                # an empty datagram from raising IndexError.
                if line[:1] == '\0':
                    line = line.lstrip('\r\n\0 ')

                for pname in ['pri', 'date', 'host', 'message']:
                    for p in pats[pname]:
                        matched = p.match(line)
                        if matched:
                            if pname == 'pri':
                                # PRI = facility * 8 + severity
                                pri = int(matched.group('pri'))
                                severity_key = str(pri & 7)
                                facility_key = str(pri >> 3)
                                currentDict['severity_int'] = severity_key
                                currentDict['facility_int'] = facility_key
                                currentDict['severity_label'] = severityMap[
                                    severity_key]
                                currentDict['facility_label'] = (
                                    facilityMap.get(facility_key, 'unknown'))
                            currentDict[pname] = matched.group(pname)
                            # Trim the line up to the ending space from the
                            # last match
                            line = line[matched.end('space'):]
                            # We matched this element so no need to keep
                            # looping on it
                            break
                    # None of the patterns matched for this field
                    if pname not in currentDict:
                        eprint("Did not match for %s: %s" % (pname, line))

                # Chop off any remaining trailing whitespace
                line = line.rstrip()

                # Finished parsing but did not consume the whole line (should
                # never happen)
                if len(line) > 0:
                    eprint("still some line left: [%s]" % line)

                # Did not match anything at all?
                # NOTE(review): 'raw_message' is always set above, so this
                # comparison cannot be true as written; left unchanged.
                if currentDict == {}:
                    eprint("matched nothing: [%s]" % line)
                    skipcount += 1
                    continue

                skip = 0
                vendor = None
                #currentDict['fromhost'] = resolveHostname(src_ip)
                currentDict['fromhost'] = src_ip
                currentDict['fromhost-ip'] = src_ip
                if 'host' not in currentDict:
                    currentDict['host'] = currentDict['fromhost'].lower()
                else:
                    currentDict['host'] = currentDict['host'].lower()

                try:
                    host = currentDict['host']
                    if 'v-' in host and host.find('-net') >= 7:
                        vendor = linux
                    elif host.startswith(juniper_prefixes):
                        vendor = juniper
                    elif host.startswith(arista_prefixes):
                        vendor = arista
                    elif host.startswith(a10_prefixes):
                        vendor = a10
                    elif host.startswith('lb'):
                        vendor = f5
                    elif host.startswith('sw'):
                        vendor = brocade
                    elif host.startswith('10.1'):
                        vendor = force10

                    if vendor:
                        currentDict['vendor'] = vendor.vendor
                        if not vendor.matchLogPattern(currentDict):
                            eprint(
                                "Did not match %s message for host %s: %s" %
                                (vendor.vendor, currentDict['host'],
                                 currentDict['message']))
                            # Flag as unmatched message
                            currentDict['state'] = 5
                    else:
                        eprint(
                            "Did not match host pattern for host: %s message: %s"
                            % (currentDict['host'], currentDict['message']))
                except KeyError:
                    # A field needed above (e.g. 'message') was never parsed
                    eprint("Field not found:", currentDict)
                    skip = 1
                    skipcount += 1

                if skip == 0:
                    yield (currentDict)
                    yieldcount += 1
def load(
        checkpoint_file,
        arch,
        fc_layer,
        quantization,
        bias_quantization,
        output_shift,
        kernel_size,
        operator,
        verbose=False,
        no_bias=None,
        conv_groups=None,
):
    """
    Load weights and biases from `checkpoint_file`. If `arch` is not None and does not match
    the architecture in the checkpoint file, abort with an error message. If `fc_layer` is
    `True`, configure a single fully connected classification layer for software rather than
    hardware.
    `quantization` is a list of expected bit widths for the layer weights (always 8 for AI84).
    This value is checked against the weight inputs. Entries that are `None` are derived from
    the weight range and written back into the list.
    `bias_quantization` is a list of the expected bit widths for the layer biases (always
    8 for AI84/AI85).
    `output_shift` is a list indexed by layer sequence; entries that are `None` are filled in
    from the checkpoint's `.output_shift` value (or 0 when there is none) plus the implicit
    shift `8 - quantization[seq]`. The list is modified in place.
    `kernel_size` is a list of per-layer kernel sizes that is checked against the shape of
    the weights; a mismatch is reported and the function exits after all layers were read.
    `operator` is a list of per-layer operators; sequence entries equal to `opn.NONE` are
    skipped.
    `no_bias` is an optional collection of layer sequence numbers whose bias is ignored.
    `conv_groups` is indexed by layer sequence and multiplies the input (or, for
    ConvTranspose2d, the output) channel count.
    In addition to returning weights and biases, this function configures the network output
    channels and the number of layers.
    When `verbose` is set, display the shapes of the weights.

    Returns the tuple `(layers, weights, bias, output_shift, fc_weights, fc_bias,
    input_channels, output_channels)`.
    """
    # NOTE(review): `conv_groups` defaults to None but is subscripted for every weight
    # layer below, so callers appear to always pass a list -- confirm.
    no_bias = no_bias or []
    weights = []
    bias = []
    fc_weights = []
    fc_bias = []
    weight_keys = []
    bias_keys = []
    quant = []
    bias_quant = []
    weight_min = []
    weight_max = []
    weight_size = []
    bias_min = []
    bias_max = []
    bias_size = []

    checkpoint = torch.load(checkpoint_file, map_location='cpu')
    print(f'Reading {checkpoint_file} to configure network weights...')

    if 'state_dict' not in checkpoint or 'arch' not in checkpoint:
        raise RuntimeError("\nNo `state_dict` or `arch` in checkpoint file.")

    if arch and checkpoint['arch'].lower() != arch.lower():
        eprint(
            f"Network architecture of configuration file ({arch}) does not match "
            f"network architecture of checkpoint file ({checkpoint['arch']}).")
        sys.exit(1)

    checkpoint_state = checkpoint['state_dict']
    layers = 0  # number of hardware layers found so far
    num_conv_layers = len(quantization)
    have_fc_layer = False
    output_channels = []
    input_channels = []
    param_count = 0
    param_size = 0
    error_exit = False  # set on a kernel_size mismatch; acted on at the end
    seq = 0  # index into the per-layer configuration lists

    for _, k in enumerate(checkpoint_state.keys()):
        # Skip over non-weight layers
        while seq < len(operator) and operator[seq] == opn.NONE:
            seq += 1

        # Only keys whose last component is 'weight' start a new layer
        operation, parameter = k.rsplit(sep='.', maxsplit=1)
        if parameter in ['weight']:
            module, op = k.split(sep='.', maxsplit=1)
            op = op.rsplit(sep='.', maxsplit=1)[0]
            if module != 'fc' or module == 'fc' and not fc_layer:
                # Ignore weights beyond the number of configured layers
                if layers >= num_conv_layers or seq >= num_conv_layers:
                    continue

                w = checkpoint_state[k].numpy().astype(np.int64)
                w_min, w_max = w.min(), w.max()

                # Determine quantization or make sure that what was given fits
                if quantization[seq] is not None:
                    assert w_min >= -(2**(quantization[seq] - 1))
                    assert w_max < 2**(quantization[seq] - 1)
                else:
                    if w_max > 0:
                        w_max_m = int(w_max)
                    else:
                        w_max_m = int(abs(w_max)) - 1
                    if w_min > 0:
                        w_min_m = int(w_min)
                    else:
                        w_min_m = int(abs(w_min)) - 1
                    # The derived width is a power of two (result of `1 << ...`)
                    quantization[seq] = 1 << (
                        fls(max(fls(w_max_m), fls(w_min_m)) + 1) + 1)
                    assert quantization[seq] <= 8
                quant.append(quantization[seq])

                weight_min.append(w_min)
                weight_max.append(w_max)

                if op == 'conv2d' and operator[seq] == opn.CONVTRANSPOSE2D:
                    # For ConvTranspose2d, flip the weights as follows:
                    w = np.flip(w, axis=(2, 3)).swapaxes(0, 1)

                mult = conv_groups[
                    seq] if operator[seq] != opn.CONVTRANSPOSE2D else 1
                input_channels.append(w.shape[1] * mult)  # Input channels
                mult = conv_groups[seq] if operator[
                    seq] == opn.CONVTRANSPOSE2D else 1
                output_channels.append(w.shape[0] * mult)  # Output channels

                # Cross-check the configured kernel size against the weight shape
                if len(w.shape) == 2:  # MLP
                    if kernel_size[seq][0] != 1 or kernel_size[seq][1] != 1:
                        eprint(
                            f'The `kernel_size` for the MLP layer {seq} should '
                            f'be set to 1x1 instead of '
                            f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                        error_exit = True
                elif len(w.shape) == 3:  # 1D
                    if kernel_size[seq][0] != w.shape[2] or kernel_size[seq][
                            1] != 1:
                        eprint(
                            f'The `kernel_size` for the 1D layer {seq} should '
                            f'be set to {w.shape[2]}x1 instead of '
                            f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                        error_exit = True
                elif len(w.shape) == 4:  # 2D
                    if kernel_size[seq][0] != w.shape[2] \
                            or kernel_size[seq][1] != w.shape[3]:
                        eprint(
                            f'The `kernel_size` for the 2D layer {seq} should '
                            f'be set to {w.shape[2]}x{w.shape[3]} instead of '
                            f'{kernel_size[seq][0]}x{kernel_size[seq][1]}.')
                        error_exit = True

                # Size in bytes, rounded up, at the layer's bit width
                w_count = np.prod(w.shape)
                param_count += w_count
                w_size = (w_count * quantization[seq] + 7) // 8
                weight_size.append(w_size)
                param_size += w_size

                if len(w.shape) == 2:  # linear - add dummy 'channel'
                    w = np.expand_dims(w, axis=0)
                else:  # conv1d, conv2d, ... - combine input and output channels
                    w = np.reshape(w, (-1, ) + w.shape[2:])

                weights.append(w)
                weight_keys.append(k)

                # Is there a bias for this layer?
                bias_name = operation + '.bias'

                if bias_name in checkpoint_state and seq not in no_bias:
                    w = checkpoint_state[bias_name].numpy(). \
                        astype(np.int64) // tornadocnn.dev.BIAS_DIV

                    w_min, w_max = w.min(), w.max()
                    assert w_min >= -(2**(bias_quantization[seq] - 1))
                    assert w_max < 2**(bias_quantization[seq] - 1)

                    bias_min.append(w_min)
                    bias_max.append(w_max)

                    bias.append(w)
                    bias_keys.append(bias_name)
                    bias_quant.append(bias_quantization[seq])
                    w_count = np.prod(w.shape)
                    param_count += w_count
                    w_size = (
                        w_count * 8 +
                        (bias_quantization[seq] - 1)) // bias_quantization[seq]
                    bias_size.append(w_size)
                    param_size += w_size
                else:
                    # Keep all per-layer lists the same length
                    bias.append(None)
                    bias_min.append(0)
                    bias_max.append(0)
                    bias_keys.append('N/A')
                    bias_quant.append(0)
                    bias_size.append(0)

                # Not overriding output_shift?
                if output_shift[seq] is None:
                    output_shift_name = operation.rsplit(
                        sep='.', maxsplit=1)[0] + '.output_shift'
                    # Is there an output_shift for this layer?
                    if output_shift_name in checkpoint_state:
                        w = checkpoint_state[output_shift_name].numpy().astype(
                            np.int64)

                        assert len(w) == 1
                        output_shift[seq] = w[0]
                    else:
                        output_shift[seq] = 0
                    # Add implicit shift based on quantization
                    output_shift[seq] += 8 - quantization[seq]

                layers += 1
                seq += 1
            elif have_fc_layer:
                eprint(
                    'The network cannot have more than one fully connected software layer, '
                    'and it must be the output layer.')
                sys.exit(1)
            elif fc_layer:
                # Software fully connected layer: values must fit in 8 bits
                w = checkpoint_state[k].numpy().astype(np.int64)
                assert w.min() >= -128 and w.max() <= 127
                fc_weights.append(w)
                # Is there a bias for this layer?
                bias_name = operation + '.bias'
                if bias_name in checkpoint_state:
                    # Do not divide bias for FC
                    w = checkpoint_state[bias_name].numpy().astype(np.int64)
                    assert w.min() >= -128 and w.max() <= 127
                    fc_bias.append(w)
                else:
                    fc_bias.append(None)
                have_fc_layer = True

    if verbose:
        print(
            f'Checkpoint for epoch {checkpoint["epoch"]}, model {checkpoint["arch"]} - '
            'weight and bias data:')
        print(
            'Layer InCh OutCh Weights Quant Shift Min Max Size '
            'Key Bias Quant Min Max Size Key'
        )
        for ll in range(layers):
            if ll < len(weights) and weights[ll] is not None:
                weight_shape = str(weights[ll].shape)
                if bias[ll] is not None:
                    bias_shape = str(bias[ll].shape)
                else:
                    bias_shape = 'N/A'
                if output_shift[ll] is not None:
                    output_shift_shape = output_shift[ll]
                else:
                    output_shift_shape = 'N/A'
                print(
                    f'{ll:4}: '
                    f'{input_channels[ll]:5} {output_channels[ll]:5} '
                    f'{weight_shape:15} '
                    f'{quant[ll]:5} {output_shift_shape:5} '
                    f'{weight_min[ll]:4} {weight_max[ll]:3} {weight_size[ll]:6} '
                    f'{weight_keys[ll]:35} '
                    f'{bias_shape:10} '
                    f'{bias_quant[ll]:5} {bias_min[ll]:4} {bias_max[ll]:3} {bias_size[ll]:4} '
                    f'{bias_keys[ll]:25}')
        print(
            f'TOTAL: {layers} layers, {param_count:,} parameters, {param_size:,} bytes'
        )

    # Deferred so that every kernel_size mismatch is reported before exiting
    if error_exit:
        sys.exit(1)

    return layers, weights, bias, output_shift, \
        fc_weights, fc_bias, input_channels, output_channels
def pool2d(
        data,
        input_size,
        output_size,
        pool,
        stride,
        average,
        floor=True,
        debug=False,
):
    """
    Compute 2D Pooling (Average or Max)

    `data` must have the shape given by `input_size`, and the result has the
    shape given by `output_size`. `pool` and `stride` hold the window size and
    step for the two pooled axes. With `average` set, the window mean is
    computed (truncated when `floor` is set, rounded otherwise); without it,
    the window maximum is used. With `debug` set, the result is additionally
    checked against a slow pure Python reference and the program exits when
    the two disagree.
    """
    assert data.shape == tuple(input_size)

    pool_h, pool_w = pool[0], pool[1]
    step_h, step_w = stride[0], stride[1]

    if debug:
        # Slow reference, one output element at a time
        # NOTE(review): for floor=False the reference rounds halves up
        # (floor(avg + 0.5)) whereas the NumPy path below uses np.round,
        # which rounds halves to even -- the two can disagree on exact .5
        # averages; confirm which one is intended.
        ref = np.empty(shape=output_size, dtype=np.int64)
        for c in range(input_size[0]):
            for row in range(0, output_size[1] * step_h, step_h):
                for col in range(0, output_size[2] * step_w, step_w):
                    window = data[c][row:row + pool_h, col:col + pool_w]
                    if not average:
                        val = np.amax(window)
                    else:
                        avg = np.average(window)
                        if not floor:
                            whole = np.floor(avg + 0.5)
                        elif avg < 0:
                            whole = np.ceil(avg)
                        else:
                            whole = np.floor(avg)
                        val = whole.astype(np.int64).clip(min=-128, max=127)
                    ref[c][row // step_h][col // step_w] = val

    # Fast computation using NumPy: drop trailing rows/columns that no window
    # covers, then build a read-only strided view of all pooling windows
    rows_used = (data.shape[1] - pool_h) // step_h * step_h + pool_h
    cols_used = (data.shape[2] - pool_w) // step_w * step_w + pool_w
    trimmed = data[:, :rows_used, :cols_used, ...]
    row_step, col_step = trimmed.strides[1:]

    windows = as_strided(
        trimmed,
        shape=(
            trimmed.shape[0],
            1 + (trimmed.shape[1] - pool_h) // step_h,
            1 + (trimmed.shape[2] - pool_w) // step_w,
            pool_h,
            pool_w,
        ),
        strides=(
            trimmed.strides[0],
            step_h * row_step,
            step_w * col_step,
            row_step,
            col_step,
        ),
        writeable=False,
    )

    window_axes = (3, 4)
    if not average:
        pooled = np.nanmax(windows, axis=window_axes)
    elif floor:
        pooled = np.nanmean(windows, dtype=np.int64, axis=window_axes)
    else:
        pooled = np.round(np.nanmean(windows, axis=window_axes)).astype(np.int64)

    if debug and not (ref == pooled).all():
        eprint('NumPy <-> Python mismatch in compute.pool2d')
        sys.exit(1)

    assert pooled.shape == tuple(output_size)

    return pooled
def print_fptr_site_unmatched_msg():
    """
    Warn that the indirect call site `fptr_sites[j]` of `funct` has no type.

    The message is prefixed with the source file/line and the assembly
    file/line of the site; `funct`, `fptr_sites` and `j` are read from the
    surrounding scope.
    """
    site = fptr_sites[j]
    location = (funct.src_filename + ':' + str(site.src_line_num) + ':' +
                funct.asm_filename + ':' + str(site.asm_line_num))
    # The function name is quoted in the message (fix for C++)
    quoted_name = '\'' + funct.asm_name + '\''
    eprint(location +
           ': warning: no type for indirect call site in function '
           'named ' + quoted_name)
def verify(
        verify_fn,
        ll,
        in_map,
        out_map,
        out_buf,
        processor_map,
        input_shape,
        out_offset,
        out_expand,
        out_expand_thresh,
        output_width=8,
        pool=None,
        pool_stride=None,
        overwrite_ok=False,
        no_error_stop=False,
        device=84,
        mlator=False,
        apb_base=0,
        stream=None,
        max_count=None,
        write_gap=0,
):
    """
    Verify HWC memory from AI8X, writing C or mem code using the `verify_fn` function.
    The generated code is specific to the network configuration passed in in `processor_map`,
    and `input_shape`. Additionally, the generated addresses are offset by `out_offset`. The
    function takes a pointer to a memory array, and the depth of the array does not matter
    (flattened or not flattened) as long as the size is correct.
    `in_map` and `out_map` are used to optionally prevent overwriting data
    (controlled by `overwrite_ok` and `no_error_stop`).
    When `mlator` is set, use the hardware mechanism to rearrange 4-channel data into single
    channels.

    `ll` is the layer number used in messages and in the entries recorded in `out_map`.
    `out_buf` is indexed as `out_buf[channel][row][col]`, with the extents taken from
    `input_shape`. `output_width` selects the element size (`output_width // 8` bytes);
    `--mlator` is ignored with a warning when that size is larger than 1.
    When `out_map` is not None, each checked word is recorded in it as
    `(ll, channel, row, col, value)`.
    `max_count`, when not None, limits how many words are passed to `verify_fn` in the
    non-mlator path; a 'Truncated' comment is written to `stream` when the limit is reached.
    `write_gap` widens the spacing between consecutive output positions.
    Nothing is returned.
    """
    # NOTE(review): `stream` defaults to None but is written to unconditionally in the
    # mlator path and when `max_count` is reached -- callers presumably always pass it.
    count = 0  # number of words handled so far (non-mlator path)

    def check_overwrite(
            p,
            target_offs,
            in_map,
            out_map,
            c,
            row,
            col,
    ):
        # Report (and, unless `no_error_stop` is set, exit on) a write to
        # `target_offs` that would clobber an existing input or output word.
        # The maps are indexed by word, hence `target_offs >> 2`.

        # If using single layer, make sure we're not overwriting the input
        if (not overwrite_ok) and in_map[target_offs >> 2] is not None:
            old_ll, old_c, old_row, old_col, _ = in_map[target_offs >> 2]
            eprint(
                f'Processor {p}: '
                f'Layer {ll} output for CHW={c},{row},{col} is overwriting '
                f'input at offset 0x{target_offs:08x} that was created by '
                f'layer {old_ll}, CHW={old_c},{old_row},{old_col}.',
                error=not no_error_stop)
            if not no_error_stop:
                sys.exit(1)
        # Check we're not overflowing the data memory
        if (not overwrite_ok) and out_map is not None and out_map[
                target_offs >> 2] is not None:
            old_ll, old_c, old_row, old_col, old_val = out_map[target_offs >> 2]
            eprint(
                f'Processor {p}: '
                f'Layer {ll} output for CHW={c},{row},{col} is overwriting '
                f'offset 0x{target_offs:08x}. Previous write by '
                f'layer {old_ll},CHW={old_c},{old_row},{old_col} with value 0x{old_val:08x}.',
                error=not no_error_stop)
            if not no_error_stop:
                sys.exit(1)

    # Start at the instance of the first active output processor/channel
    coffs_start = ffs(processor_map) & ~(tc.dev.P_SHARED - 1)
    next_layer_map = processor_map >> coffs_start
    # Output expansion for channels and/or wide output
    out_size = output_width // 8
    width = out_expand * out_size

    if not mlator or out_size > 1:
        if mlator:
            eprint('ignoring --mlator for 32-bit output', error=False)

        # Walk every pixel position; for each, walk the channels four at a time
        for doffs in range(input_shape[1] * input_shape[2]):
            row, col = divmod(doffs, input_shape[2])
            this_map = next_layer_map
            # NOTE(review): `coffs` is advanced below but not read in this function
            coffs = coffs_start
            poffs = coffs_start
            c = 0
            while c < input_shape[0]:
                if c % out_expand_thresh == 0:
                    poffs = coffs_start
                    this_map = next_layer_map  # Wrap around for AI85 channel expansion

                this_c = c  # first channel of this group of up to four
                expand = c // out_expand_thresh  # Channels 64+ handled by processors 0+
                # Physical offset into instance and group
                proc = poffs & ~(tc.dev.P_SHARED - 1)

                # Get four bytes or words either from output or zeros and construct HWC word
                no_data = True
                if out_size == 1:
                    # Pack up to four bytes into one word, first channel in the low byte
                    val = 0
                    for _ in range(4):
                        val >>= 8
                        if this_map & 1:
                            no_data = False
                            if c < input_shape[0]:
                                val |= (out_buf[c][row][col] & 0xff) << 24
                            c += 1
                        this_map >>= 1
                else:
                    # Wide output: keep up to four separate 32-bit values
                    val = [0] * 4
                    for i in range(4):
                        if this_map & 1:
                            no_data = False
                            if c < input_shape[0]:
                                val[i] = out_buf[c][row][col] & 0xffffffff
                            c += 1
                        this_map >>= 1

                # Get the offset of the first output byte/word of 4
                offs = tc.dev.C_SRAM_BASE + out_offset - (write_gap << 2) + \
                    (((proc % tc.dev.P_NUMPRO) * tc.dev.INSTANCE_SIZE |
                      (proc // tc.dev.P_NUMPRO) * tc.dev.C_GROUP_OFFS // 4) +
                     (doffs * (write_gap + 1)) * width + expand * out_size) * 4

                # Special adjustment for AI84 quirk
                if device == 84 and pool and pool[0] == 4 and pool_stride[
                        0] == 4:
                    offs += (doffs // 4) * 8 + 8

                if not no_data:
                    # Channels actually present in this group
                    num_bytes = min(c - this_c, input_shape[0] - this_c)
                    if out_size == 1:
                        check_overwrite(
                            proc,
                            offs,
                            in_map,
                            out_map,
                            this_c,
                            row,
                            col,
                        )
                        if out_map is not None:
                            out_map[offs >> 2] = (ll, this_c, row, col, val)
                        if max_count is None or count < max_count:
                            verify_fn(
                                offs,
                                val,
                                rv=False,
                                comment=
                                f' // {row},{col},{this_c}-{this_c+num_bytes-1}',
                                num_bytes=num_bytes,
                                first_proc=ffs(next_layer_map >> proc) % 4,
                            )
                    else:
                        for i in range(min(num_bytes, out_size)):
                            check_overwrite(
                                proc,
                                offs,
                                in_map,
                                out_map,
                                this_c,
                                row,
                                col,
                            )
                            if out_map is not None:
                                out_map[offs >> 2] = (ll, this_c, row, col,
                                                      val[i])
                            if max_count is None or count < max_count:
                                verify_fn(
                                    offs,
                                    val[i],
                                    rv=False,
                                    comment=f' // {row},{col},{this_c+i}',
                                )
                            offs += out_size
                    count += 1
                    if count == max_count:
                        stream.write(' // Truncated further checks...\n')

                coffs += 4
                poffs += 4
    else:  # mlator == True
        assert out_size == 1
        c = 0
        poffs = coffs_start
        this_map = next_layer_map
        read_addr = None  # next address the mlator is expected to deliver

        while c < input_shape[0]:
            if c % out_expand_thresh == 0:
                poffs = coffs_start  # Wrap around for AI85 channel expansion
                this_map = next_layer_map

            expand = c // out_expand_thresh  # Channels 64+ handled by processors 0+
            # Physical offset into instance and group
            proc = poffs & ~(tc.dev.P_SHARED - 1)

            # Register addresses for the group that holds this processor
            addr = tc.dev.C_CNN_BASE + (proc // tc.dev.P_NUMPRO) * tc.dev.C_GROUP_OFFS
            mlat = addr + tc.dev.REG_MLAT * 4
            ctrl = addr + tc.dev.REG_CTL * 4

            for shift in range(4):
                if this_map & 1:
                    # One channel at a time, four consecutive positions per word
                    for doffs in range(0, input_shape[1] * input_shape[2], 4):
                        row, col = divmod(doffs, input_shape[2])

                        # Get four bytes or words either from output or zeros and
                        # construct HWC word
                        val = 0
                        for i in range(4):
                            val >>= 8
                            if col + i < input_shape[2]:
                                val |= (out_buf[c][row][col + i] & 0xff) << 24

                        # Get the offset of the first output byte/word of 4
                        source = out_offset + \
                            (((proc % tc.dev.P_NUMPRO) * tc.dev.INSTANCE_SIZE |
                              (proc // tc.dev.P_NUMPRO) * tc.dev.C_GROUP_OFFS // 4) +
                             (doffs >> 2) * width) * 4

                        # Re-program the mlator only when the address is not the
                        # one that follows the previous read
                        if source != read_addr:
                            if doffs != 0:
                                stream.write(
                                    f' *((volatile uint32_t *) '
                                    f'0x{apb_base + ctrl:08x}) = '
                                    f'0x{tc.dev.READY_SEL << 1 | 1 << 3:08x}; '
                                    '// Disable mlator\n')
                            # Set wptr to start address
                            w = apb_base + addr + tc.dev.C_CNN*4 \
                                + tc.dev.LREG_WPTR_BASE*4 * tc.dev.MAX_LAYERS
                            stream.write(
                                f' *((volatile uint32_t *) 0x{w:08x}) = '
                                f'0x{source >> 2:08x}; // Set SRAM address\n')
                            # Set wptr_inc to set increment value (default: 1)
                            w = apb_base + addr + tc.dev.C_CNN*4 \
                                + tc.dev.LREG_LCTL2*4 * tc.dev.MAX_LAYERS
                            stream.write(
                                f' *((volatile uint32_t *) 0x{w:08x}) = '
                                f'0x{expand:08x}; // Set pointer increment\n')
                            # Set mlatorld enable bit to load write ptr; select byte 0..3
                            w = tc.dev.READY_SEL << 1 | 1 << 16 | shift << 17 | 1 << 3
                            stream.write(
                                f' *((volatile uint32_t *) 0x{apb_base + ctrl:08x}) ='
                                f' 0x{w:08x}; '
                                f'// Enable mlator, byte {shift}\n')
                            stream.write(
                                ' asm volatile ("" : "=m" (*((volatile uint32_t *) '
                                f'0x{apb_base + mlat:08x})) : "r" '
                                f'(*((volatile uint32_t *) 0x{apb_base + mlat:08x})));'
                                ' // Prime\n')

                        num_bytes = min(4, input_shape[2] - col)
                        check_overwrite(
                            proc,
                            tc.dev.C_SRAM_BASE + source,
                            in_map,
                            out_map,
                            c,
                            row,
                            col,
                        )
                        if out_map is not None:
                            out_map[source >> 2] = (ll, c, row, col, val)
                        verify_fn(
                            mlat,
                            val,
                            rv=False,
                            comment=f' // {row},{col}-{col+num_bytes-1},{c}',
                            num_bytes=num_bytes,
                        )

                        read_addr = source + 4
                    # Disable mlator
                    stream.write(f' *((volatile uint32_t *) '
                                 f'0x{apb_base + ctrl:08x}) = '
                                 f'0x{tc.dev.READY_SEL << 1 | 1 << 3:08x}; '
                                 '// Disable mlator\n')

                this_map >>= 1
                c += 1

            poffs += 4