Example #1
def stream(cfg,
           classes_file,
           weights,
           socket_ip,
           socket_port,
           image_size=128,
           confidence_threshold=0.6,
           nms_thres=0.5):
    print('+ Initializing model')
    model = Darknet(cfg, image_size)
    print('+ Loading model')
    load_darknet_weights(model, weights)
    print('+ Fusing model')
    model.fuse()
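    # fuse() folds BatchNorm parameters into the preceding convolutions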
    print('+ Moving model to CPU')
    model.to('cpu').eval()
    print('+ Loading webcam')
    cap = LoadKinect(img_size=image_size)
    print('+ Loading classes')
    classes = load_classes(classes_file)
    colors = [[random.randint(0, 255) for _ in range(3)]
              for _ in range(len(classes))]
    print('+ Connecting to remote socket')
    global sock
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((socket_ip, socket_port))
    print('+ Enumerating cam')
    for counter, (path, img, im0, vid_cap) in enumerate(cap):
        t = time.time()

        print('+ Loading image to CPU')
        img = torch.from_numpy(img).unsqueeze(0).to('cpu')
        pred, _ = model(img)
        print('+ Detecting objects')
        det = non_max_suppression(pred, confidence_threshold, nms_thres)[0]
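        # det rows: [x1, y1, x2, y2, conf, cls_conf, cls]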

        if det is not None and len(det) > 0:
            detected_classes = []
            print('+ Rescaling boxes')
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4],
                                      im0.shape).round()

            print('+ Reading depth')

            # raw Kinect depth frame; swap axes to iterate over columns
            depth = get_depth()
            depth_swap = np.swapaxes(depth, 0, 1)

            # per-column near-distance estimate: the 101st-smallest value
            depth_strip1d = np.array([
                np.sort(stripe)[100] for stripe in depth_swap
            ]).astype(np.uint8)
            # broadcast the 1D profile back to a 2D image
            depth_strip2d_swap = np.array([
                np.ones(depth_swap.shape[1]) * d for d in depth_strip1d
            ]).astype(np.uint8)
            depth_strip2d = np.swapaxes(depth_strip2d_swap, 0, 1)

            depth_edge1d = np.zeros(depth_strip1d.shape)

            # binarize the depth profile against a fixed threshold of 230
            state = False
            for counter, _ in np.ndenumerate(depth_edge1d[:-1]):
                state = not state and depth_strip1d[counter] < 230
                depth_edge1d[counter[0]] = not state

            # despeckle: overwrite state runs shorter than 10 samples
            state = False
            state_cnt = 0
            for counter, _ in np.ndenumerate(depth_edge1d[:-1]):
                counter = counter[0]
                if depth_edge1d[counter] == state:
                    state_cnt += 1
                else:
                    if state_cnt < 10:
                        for r in range(max(0, counter - 10), counter):
                            depth_edge1d[r] = state
                    state_cnt = 0
                    state = depth_edge1d[counter]

            depth_edge1d = depth_edge1d * 255

            depth_edge2d_swap = np.array([
                np.ones(100) * edge for edge in depth_edge1d
            ]).astype(np.uint8)
            depth_edge2d = np.swapaxes(depth_edge2d_swap, 0, 1)

            for *coordinates, conf, cls_conf, cls in det:
                if classes[int(cls)] in RISKY_CLASSES:
                    label = '%s %.2f' % (classes[int(cls)], conf)
                    plot_one_box(coordinates,
                                 im0,
                                 label=label,
                                 color=colors[int(cls)])
                    print(f"+ Detected {classes[int(cls)]}")
                    # mean depth in a 10-pixel band around each box coordinate
                    x_avg_depth = np.mean(depth[coordinates[0] -
                                                5:coordinates[0] + 5])
                    y_avg_depth = np.mean(depth[coordinates[1] -
                                                5:coordinates[1] + 5])
                    detected_classes.append({
                        classes[int(cls)]: {
                            'x': coordinates[0],
                            'y': coordinates[1],
                            'z':
                            np.average(np.array([x_avg_depth, y_avg_depth]))
                        }
                    })

            n = []
            for detection in detected_classes:
                width = im0.shape[1]
                name = list(detection.keys())[0]
                x, y, z = detection[name].values()
                # map x from [0, width] to an angle in [-FOV/2, +FOV/2]
                phi = (x / width * 2 - 1) * (CAMERA_FOV / 2)
                n.append(f"{name};{phi};{z}|")
            # join the records and strip the trailing '|'
            sock.send(''.join(n)[:-1].encode('utf-8'))
        print('+ Cycle took %.3fs' % (time.time() - t))
        plt.imshow(bgr_to_rgb(im0))
        plt.show(block=False)
        plt.pause(.001)
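
The example above wires model loading, Kinect capture, and the socket uplink into a single loop. A minimal driver might look like the sketch below; the paths and the endpoint are illustrative placeholders, not values from the example.

if __name__ == '__main__':
    # hypothetical paths and endpoint; adjust to your checkout
    stream(cfg='cfg/yolov3.cfg',
           classes_file='data/classes.txt',
           weights='weights/yolov3.weights',
           socket_ip='127.0.0.1',
           socket_port=1337)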
Example #2
def train(cfg,
          data_cfg,
          img_size=416,
          resume=False,
          epochs=100,
          batch_size=16,
          accumulated_batches=1,
          multi_scale=False,
          freeze_backbone=True,
          var=0,
          weight_path="weights/rainy",
          result="result.txt",
          ckpt=10):
    weights = weight_path
    latest = weights + 'latest.pt'
    best = weights + 'best.pt'
    device = torch_utils.select_device()

    if multi_scale:  # pass maximum multi_scale size
        img_size = 608
    else:
        torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run
    train_path = parse_data_cfg(data_cfg)['train']

    # Initialize model
    model = Darknet(cfg, img_size)

    # Get dataloader
    dataloader = LoadImagesAndLabels(train_path,
                                     batch_size,
                                     img_size,
                                     multi_scale=multi_scale,
                                     augment=True)

    lr0 = 0.001
    cutoff = 10  # backbone extends up to this layer
    start_epoch = 0
    best_loss = float('inf')
    if resume:
        checkpoint = torch.load(latest, map_location='cpu')

        # Load weights to resume from
        model.load_state_dict(checkpoint['model'])

        # if torch.cuda.device_count() > 1:
        #   model = nn.DataParallel(model)
        model.to(device).train()

        # Transfer learning (train only YOLO layers)
        # for i, (name, p) in enumerate(model.named_parameters()):
        #     p.requires_grad = True if (p.shape[0] == 255) else False

        # Set optimizer
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad,
                                           model.parameters()),
                                    lr=lr0,
                                    momentum=.9)

        start_epoch = checkpoint['epoch'] + 1
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])
            best_loss = checkpoint['best_loss']

        del checkpoint  # current, saved

    else:
        # Initialize model with backbone (optional)
        if cfg.endswith('yolov3.cfg'):
            load_darknet_weights(model, weights + 'darknet53.conv.74')
            cutoff = 75
        elif cfg.endswith('yolov3-tiny.cfg'):
            load_darknet_weights(model, weights + 'yolov3-tiny.conv.15')
            cutoff = 15
        elif cfg.startswith('cfg/bdd100k'):
            # transfer learning from the full yolov3 weights
            print("Applying transfer learning for bdd100k cfg")
            tmp_model = Darknet('cfg/yolov3.cfg', img_size)
            load_darknet_weights(tmp_model, "weights/yolov3.weights")
            pretrained_dict = tmp_model.state_dict()

            for k, v in model.state_dict().items():
                if v.shape != pretrained_dict[k].shape:
                    pretrained_dict[k] = torch.empty(v.shape)
                    #TODO: conv, batch
                    if k.split(".")[2].startswith("conv"):
                        nn.init.normal_(pretrained_dict[k], 0.0, 0.03)
                    elif k.split(".")[2].startswith("batch_norm") and k.split(
                            ".")[3] == "weight":
                        nn.init.normal_(pretrained_dict[k], 1.0, 0.03)
                    elif k.split(".")[2].startswith("batch_norm") and k.split(
                            ".")[3] == "bias":
                        nn.init.constant_(pretrained_dict[k], 0.0)
                    else:
                        nn.init.normal_(pretrained_dict[k],
                                        torch.mean(v).item(),
                                        torch.std(v).item())

                    print(k, v.shape)
            model.load_state_dict(pretrained_dict)
            del tmp_model
        # freeze the first `cutoff` layers (overrides the value set above)
        cutoff = 10
        model.freeze_layers(cutoff)
        model.to(device).train()

        # Set optimizer
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad,
                                           model.parameters()),
                                    lr=lr0,
                                    momentum=.9)

    # Set scheduler
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[54, 61], gamma=0.1)

    t0 = time.time()
    model_info(model)
    n_burnin = min(round(dataloader.nB / 5), 1000)  # number of burn-in batches
    for epoch in range(1, epochs + 1):
        epoch += start_epoch

        print(('%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf',
                                          'cls', 'total', 'nTargets', 'time'))

        # Update scheduler (automatic)
        # scheduler.step()

        # Manual LR schedule: drop to lr0/10 after epoch 50
        if epoch > 50:
            lr = lr0 / 10
        else:
            lr = lr0
        for g in optimizer.param_groups:
            g['lr'] = lr

        # Freeze darknet53.conv.74 for first epoch
        if freeze_backbone:
            for i, (name, p) in enumerate(model.named_parameters()):
                if int(name.split('.')[1]) < cutoff:  # layers below cutoff
                    p.requires_grad = False if (epoch == 0) else True

        ui = -1
        rloss = defaultdict(float)  # running loss
        optimizer.zero_grad()
        for i, (imgs, targets, _, _, var) in enumerate(dataloader):
            if sum([len(x) for x in targets]) < 1:  # if no targets continue
                continue

            # SGD burn-in
            if (epoch == 0) & (i <= n_burnin):
                lr = lr0 * (i / n_burnin)**4
                for g in optimizer.param_groups:
                    g['lr'] = lr

            # Compute loss, compute gradient, update parameters
            loss = model(imgs.to(device), targets, var=0)
            loss.backward()

            # accumulate gradient for x batches before optimizing
            if ((i + 1) % accumulated_batches
                    == 0) or (i == len(dataloader) - 1):
                optimizer.step()
                optimizer.zero_grad()

            # Running epoch-means of tracked metrics
            ui += 1
            for key, val in model.losses.items():
                rloss[key] = (rloss[key] * ui + val) / (ui + 1)

            s = ('%8s%12s' + '%10.3g' * 7) % (
                '%g/%g' % (epoch, epochs + start_epoch), '%g/%g' %
                (i, len(dataloader) - 1), rloss['xy'], rloss['wh'],
                rloss['conf'], rloss['cls'], rloss['loss'], model.losses['nT'],
                time.time() - t0)
            t0 = time.time()
            print(s)
        # Update best loss
        loss_per_target = rloss['loss'] / rloss['nT']
        if loss_per_target < best_loss:
            best_loss = loss_per_target

        # Save latest checkpoint
        checkpoint = {
            'epoch': epoch,
            'best_loss': best_loss,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        torch.save(checkpoint, latest)

        # Save best checkpoint
        if best_loss == loss_per_target:
            os.system('cp ' + latest + ' ' + best)

        # Save backup weights every 5 epochs (optional)
        if (epoch > 0) & (epoch % ckpt == 0):
            os.system('cp ' + latest + ' ' + weights +
                      'backup{}.pt'.format(epoch))

        # Calculate mAP
        with torch.no_grad():
            mAP, R, P = test.test(cfg,
                                  data_cfg,
                                  weights=latest,
                                  batch_size=batch_size,
                                  img_size=img_size)

        # Write epoch results
        with open(result, 'a') as file:
            file.write(s + '%11.3g' * 3 % (mAP, P, R) + '\n')
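
With batch_size=16 and accumulated_batches=4, optimizer.step() fires every fourth batch, giving an effective batch of 64. A hypothetical resume call (the cfg and data paths are placeholders):

train('cfg/yolov3.cfg',
      'cfg/coco.data',
      img_size=416,
      resume=True,
      batch_size=16,
      accumulated_batches=4)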
Example #3
def app():
    cfg = 'ml-data/yolov3.cfg'
    global image_size
    image_size = 320
    weights = 'ml-data/weights/yolov3.weights'
    classes_file = 'ml-data/classes.txt'
    socket_ip = '10.10.10.1'
    # socket_ip = '127.0.0.1'
    socket_port = 1337

    print('+ Initializing model')
    global model
    model = Darknet(cfg, image_size)
    print('+ Loading model')
    load_darknet_weights(model, weights)
    print('+ Fusing model')
    model.fuse()
    print('+ Moving model to CPU')
    model.to('cpu').eval()
    print('+ Loading classes')
    global classes
    classes = load_classes(classes_file)
    global colors
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(classes))]
    print('+ Connecting to remote socket')
    global sock
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((socket_ip, socket_port))

    while True:
        #
        # Depth
        #
        depth_result = analyse_depth()
        depth_raw = depth_result["raw"]
        depth_done = depth_result["done"]
        depth_objects = depth_result["objects"]

        #
        # RGB
        #
        rgb_result = analyse_rgb()
        rgb_raw = rgb_result["raw"]
        rgb_done = rgb_result["done"]
        rgb_objects = rgb_result["objects"]

        print("FRAME [D]: " + depth_objects)
        print("FRAME [C]: " + rgb_objects)

        sock.send(f"{rgb_objects}|{depth_objects}".encode('utf-8'))

        time.sleep(0.01)

        # Plot
        vbar = np.zeros((depth_raw.shape[0], 5, 3)).astype(np.uint8)
        depthbar = np.concatenate((depth_raw, vbar, depth_done), axis=1)
        rgbbar = np.concatenate((rgb_raw, vbar, rgb_done), axis=1)
        hbar = np.zeros((5, depthbar.shape[1], 3)).astype(np.uint8)
        cv2.imshow('LineWarn', np.concatenate((depthbar, hbar, rgbbar), axis=0))

        if cv2.waitKey(10) == 27:
            break
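
The socket peer is not part of the example. A minimal receiver, assuming the 'name;phi;z' records separated by '|' that Example #1 emits (hypothetical, not from the original code):

import socket

srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
srv.bind(('0.0.0.0', 1337))
srv.listen(1)
conn, _ = srv.accept()
while True:
    data = conn.recv(4096)
    if not data:
        break
    for record in data.decode('utf-8').split('|'):
        if record:
            name, phi, z = record.split(';')
            print(f"{name}: angle={float(phi):.1f}, depth={float(z):.0f}")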
Example #4
def train(
    cfg,
    data_cfg,
    weights_from="",
    weights_to="",
    save_every=10,
    img_size=(1088, 608),
    resume=False,
    epochs=100,
    batch_size=16,
    accumulated_batches=1,
    freeze_backbone=False,
    opt=None,
):
    NUM_WORKERS = opt.num_workers

    # build a run directory name like 'run31_12_23_59' from the current UTC time
    time_str = strftime("%Y-%d-%m %H:%M:%S", gmtime())
    time_str = time_str[5:-3].replace('-', '_')
    time_str = time_str.replace(' ', '_')
    time_str = time_str.replace(':', '_')
    weights_to = osp.join(weights_to, 'run' + time_str)
    mkdir_if_missing(weights_to)
    mkdir_if_missing(weights_to + '/cfg/')
    if resume:
        latest_resume = osp.join(weights_from, 'latest.pt')

    torch.backends.cudnn.benchmark = True  # unsuitable for multiscale

    # Configure run
    f = open(data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()

    transforms = T.Compose([T.ToTensor()])
    # Get dataloader
    dataset = JointDataset(dataset_root,
                           trainset_paths,
                           img_size,
                           augment=True,
                           transforms=transforms)
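    # drop_last=True discards the final incomplete batch so every step
    # sees a full batch_size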
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=NUM_WORKERS,
                                             pin_memory=True,
                                             drop_last=True,
                                             collate_fn=collate_fn)
    # Initialize model
    model = Darknet(cfg, dataset.nID)

    cutoff = -1  # backbone extends up to this layer
    start_epoch = 0
    if resume:
        checkpoint = torch.load(latest_resume, map_location='cpu')

        # Load weights to resume from
        model.load_state_dict(checkpoint['model'])
        model.cuda().train()

        # Set optimizer
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad,
                                           model.parameters()),
                                    lr=opt.lr,
                                    momentum=.9)

        start_epoch = checkpoint['epoch'] + 1
        if checkpoint['optimizer'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer'])

        del checkpoint  # current, saved

    else:
        # Initialize model with backbone (optional)
        if cfg.endswith('yolov3.cfg'):
            load_darknet_weights(model,
                                 osp.join(weights_from, 'darknet53.conv.74'))
            cutoff = 75
        elif cfg.endswith('yolov3-tiny.cfg'):
            load_darknet_weights(model,
                                 osp.join(weights_from, 'yolov3-tiny.conv.15'))
            cutoff = 15

        model.cuda().train()

        # Set optimizer
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad,
                                           model.parameters()),
                                    lr=opt.lr,
                                    momentum=.9,
                                    weight_decay=1e-4)

    model = torch.nn.DataParallel(model)
    # Set scheduler
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[int(0.5 * opt.epochs),
                    int(0.75 * opt.epochs)],
        gamma=0.1)
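    # e.g. with epochs=100 the learning rate drops tenfold at epochs 50 and 75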

    # An important trick for detection: freeze bn during fine-tuning
    if not opt.unfreeze_bn:
        for i, (name, p) in enumerate(model.named_parameters()):
            p.requires_grad = 'batch_norm' not in name

    # model_info(model)
    t0 = time.time()
    for epoch in range(epochs):
        epoch += start_epoch
        logger.info(
            ('%8s%12s' + '%10s' * 6) % ('Epoch', 'Batch', 'box', 'conf', 'id',
                                        'total', 'nTargets', 'time'))

        # Freeze darknet53.conv.74 for first epoch
        if freeze_backbone and (epoch < 2):
            for i, (name, p) in enumerate(model.named_parameters()):
                if int(name.split('.')[2]) < cutoff:  # layers below cutoff
                    p.requires_grad = False if (epoch == 0) else True

        ui = -1
        rloss = defaultdict(float)  # running loss

        ## training schedule
        optimizer.zero_grad()

        for i, (imgs, targets, _, _, targets_len) in enumerate(dataloader):
            if sum([len(x) for x in targets]) < 1:  # if no targets continue
                continue

            # SGD burn-in
            burnin = min(1000, len(dataloader))
            if (epoch == 0) & (i <= burnin):
                lr = opt.lr * (i / burnin)**4
                for g in optimizer.param_groups:
                    g['lr'] = lr

            # Compute loss, compute gradient, update parameters
            loss, components = model(imgs.cuda(), targets.cuda(),
                                     targets_len.cuda())
            components = torch.mean(components.view(-1, 5), dim=0)
            loss = torch.mean(loss)
            loss.backward()

            # accumulate gradient for x batches before optimizing
            if ((i + 1) % accumulated_batches
                    == 0) or (i == len(dataloader) - 1):
                optimizer.step()
                optimizer.zero_grad()

            # Running epoch-means of tracked metrics
            ui += 1

            for ii, key in enumerate(model.module.loss_names):
                rloss[key] = (rloss[key] * ui + components[ii]) / (ui + 1)

            # rloss indicates running loss values with mean updated at every epoch
            s = ('%8s%12s' + '%10.3g' * 6) % (
                '%g/%g' % (epoch, epochs - 1), '%g/%g' %
                (i, len(dataloader) - 1), rloss['box'], rloss['conf'],
                rloss['id'], rloss['loss'], rloss['nT'], time.time() - t0)
            t0 = time.time()
            if i % opt.print_interval == 0:
                logger.info(s)

        # Save latest checkpoint
        checkpoint = {
            'epoch': epoch,
            'model': model.module.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        copyfile(cfg, weights_to + '/cfg/yolo3.cfg')
        copyfile(data_cfg, weights_to + '/cfg/ccmcpe.json')

        latest = osp.join(weights_to, 'latest.pt')
        torch.save(checkpoint, latest)
        if epoch % save_every == 0 and epoch != 0:
            # drop the optimizer state to keep the backup checkpoint small
            checkpoint["optimizer"] = []
            torch.save(
                checkpoint,
                osp.join(weights_to, "weights_epoch_" + str(epoch) + ".pt"))

        # Calculate mAP
        '''
        if epoch % opt.test_interval == 0:
            with torch.no_grad():
                mAP, R, P = test.test(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size,
                                      print_interval=40, nID=dataset.nID)
                test.test_emb(cfg, data_cfg, weights=latest, batch_size=batch_size, img_size=img_size,
                              print_interval=40, nID=dataset.nID)
        '''

        # With PyTorch >= 1.1.0, call scheduler.step() after optimizer.step()
        scheduler.step()
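
The schedule combines a quartic burn-in over the first epoch with tenfold MultiStepLR drops at 50% and 75% of training. A standalone sketch of the resulting learning rate (base_lr and epochs are illustrative, not values from the example):

def lr_at(epoch, step, steps_per_epoch, base_lr=0.01, epochs=100):
    burnin = min(1000, steps_per_epoch)
    if epoch == 0 and step <= burnin:
        return base_lr * (step / burnin) ** 4  # quartic warm-up
    lr = base_lr
    if epoch >= int(0.5 * epochs):
        lr *= 0.1  # first milestone
    if epoch >= int(0.75 * epochs):
        lr *= 0.1  # second milestone
    return lr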
Example #5
def run(
        act_dtype=ng.int16,
        weight_dtype=ng.int8,
        bias_dtype=ng.int32,
        scale_dtype=ng.int8,
        disable_fusion=False,
        conv2d_par_ich=1,
        conv2d_par_och=1,
        conv2d_par_col=1,
        conv2d_par_row=1,
        conv2d_concur_och=None,
        conv2d_stationary='filter',
        pool_par=1,
        elem_par=1,
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        onnx_filename='yolov3-tiny.onnx',
        weight_filename='yolov3-tiny.npy',
        verilog_filename=None,
        sim_filename=None,
        # simtype=None,  # no RTL simulation
        # simtype='iverilog',
        simtype='verilator',
        cfg_filename='yolov3-tiny.cfg',
        weights_filename='yolov3-tiny.weights',
        model_path='yolov3'):

    # input mean and standard deviation
    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)

    img_size = (416, 416)
    act_shape = (1, img_size[0], img_size[1], 3)

    # pytorch model
    model_url = "https://github.com/ultralytics/yolov3"
    if not os.path.isdir(model_path):
        raise FileNotFoundError(
            "Download the PyTorch YOLOv3 repository from '%s', "
            "then extract it and rename the directory to '%s'" %
            (model_url, model_path))

    # Darknet model configuration and pretrained weights
    # use the raw file URL; the GitHub 'blob' page would download HTML
    cfg_url = ("https://raw.githubusercontent.com/pjreddie/darknet/"
               "master/cfg/yolov3-tiny.cfg")
    if not os.path.isfile(cfg_filename):
        urllib.request.urlretrieve(cfg_url, cfg_filename)

    weights_url = "https://pjreddie.com/media/files/yolov3-tiny.weights"
    if not os.path.isfile(weights_filename):
        urllib.request.urlretrieve(weights_url, weights_filename)

    sys.path.insert(0, model_path)
    import models
    models.ONNX_EXPORT = True

    model = models.Darknet(cfg_filename, img_size).to('cpu')
    models.load_darknet_weights(model, weights_filename)

    # Pytorch to ONNX
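    # act_shape is NHWC; swapping dims 1 and 3 yields the NCHW layout
    # that the PyTorch model expects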
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['scores', 'boxes']
    model.eval()
    torch.onnx.export(model,
                      dummy_input,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    dtypes = {}
    shapes = {}
    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=dtypes,
                               value_shapes=shapes,
                               default_placeholder_dtype=act_dtype,
                               default_variable_dtype=weight_dtype,
                               default_constant_dtype=weight_dtype,
                               default_operator_dtype=act_dtype,
                               default_scale_dtype=scale_dtype,
                               default_bias_dtype=bias_dtype,
                               disable_fusion=disable_fusion,
                               verbose=False)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    # input scale factor: capped at 128 for wide activations, otherwise
    # half of the signed range of act_dtype
    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}
    input_means = {'act': imagenet_mean * act_scale_factor}
    input_stds = {'act': imagenet_std * act_scale_factor}

    ng.quantize(outputs, input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool, ng.avg_pool_serial,
                           ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    outs = (outputs['scores'], outputs['boxes'])

    # verification data
    img = np.array(PIL.Image.open('car416x416.png').convert('RGB')).astype(
        np.float32)
    img = img.reshape([1] + list(img.shape))

    img = img / 255
    img = (img - imagenet_mean) / imagenet_std

    # execution on pytorch
    model_input = img

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_rslts = model(torch.from_numpy(model_input))
    model_outs = [rslt.detach().numpy() for rslt in model_rslts]
    model_outs = [(np.transpose(model_out, act.perm) if act.perm is not None
                   and len(model_out.shape) == len(act.shape) else model_out)
                  for model_out in model_outs]
    scaled_model_outs = [
        model_out * out.scale_factor
        for model_out, out in zip(model_outs, outs)
    ]

    # software-based verification
    vact = img * act_scale_factor
    vact = np.clip(vact, -1.0 * (2**(act.dtype.width - 1) - 1),
                   1.0 * (2**(act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)

    # compare outputs of hidden layers
    leaky_relu_ops = [
        v for k, v in operators.items()
        if (isinstance(v, ng.conv2d)
            and isinstance(v.act_func, ng.leaky_relu_base))
    ]
    leaky_relu_ops = list(sorted(set(leaky_relu_ops),
                                 key=leaky_relu_ops.index))

    conv2d_ops = [
        v for k, v in operators.items()
        if (isinstance(v, ng.conv2d) and v.act_func is None)
    ]
    conv2d_ops = list(sorted(set(conv2d_ops), key=conv2d_ops.index))

    # only 1st output
    sub_ops = leaky_relu_ops[:9] + conv2d_ops[:1]
    sub_outs = ng.eval(sub_ops, act=vact)
    sub_outs = [sub_out.transpose([0, 3, 1, 2]) for sub_out in sub_outs]
    sub_scale_factors = [sub_op.scale_factor for sub_op in sub_ops]

    model.eval()
    mouts = []
    # all Conv2d-LeakyReLU layers before YOLOLayer: evaluate the first n
    # modules of the Darknet at each tap point
    for n in (1, 3, 5, 7, 9, 11, 13, 14, 15, 16):
        mouts.append(
            nn.Sequential(*model.module_list[:n])(
                torch.from_numpy(model_input)).detach().numpy())

    scaled_mouts = [
        mout * scale_factor
        for mout, scale_factor in zip(mouts, sub_scale_factors)
    ]

    sub_mean_square_errors = [
        np.sum((sub_out - mout)**2) / sub_out.size
        for mout, sub_out in zip(scaled_mouts, sub_outs)
    ]
    sub_corrcoefs = [
        np.corrcoef(mout.reshape([-1]), sub_out.reshape([-1]))
        for mout, sub_out in zip(mouts, sub_outs)
    ]

    # compare prediction results
    vouts = ng.eval(outs, act=vact)

    mean_square_errors = [
        np.sum((vout - scaled_model_out)**2) / vout.size
        for vout, scaled_model_out in zip(vouts, scaled_model_outs)
    ]
    corrcoefs = [
        np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))
        for model_out, vout in zip(model_outs, vouts)
    ]

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen(outs, 'yolov3tiny', silent=silent,
    #                        config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact(outs,
                        'yolov3tiny',
                        silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog(outs, 'yolov3tiny', silent=silent,
    #                    config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Save the quantized weights
    # --------------------
    param_data = ng.export_ndarray(outs, chunk_size)
    param_bytes = len(param_data)
    np.save(weight_filename, param_data)

    # --------------------
    # (7) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

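    # DRAM layout: activations, parameters, then the two golden outputs,
    # each aligned up to a chunk_size boundary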
    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check0_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    check1_addr = int(
        math.ceil(
            (check0_addr + outs[0].memory_size) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check1_addr + outs[1].memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    # mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)],
                   dtype=np.int16)
    mem = mem + 100  # fill with a non-zero pattern

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vouts[0], memimg_datawidth, act_dtype.width, check0_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_och))
    axi.set_memory(
        mem, vouts[1], memimg_datawidth, act_dtype.width, check1_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if sim_filename is None:
        sim_filename = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + sim_filename

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        for i in range(100):  # idle a few cycles before starting
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(outs[0].shape[0]):
            for x in range(outs[0].shape[1]):
                orig = memory.read_word(bat * outs[0].aligned_shape[1] + x,
                                        outs[0].addr, act_dtype.width)
                check = memory.read_word(bat * outs[0].aligned_shape[1] + x,
                                         check0_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x, ') orig: ', orig, ' check: ', check)
                    ok = False
                # else:
                #    print('OK (', bat, x,
                #          ') orig: ', orig, ' check: ', check)

        for bat in range(outs[1].shape[0]):
            for x in range(outs[1].shape[1]):
                orig = memory.read_word(bat * outs[1].aligned_shape[1] + x,
                                        outs[1].addr, act_dtype.width)
                check = memory.read_word(bat * outs[1].aligned_shape[1] + x,
                                         check1_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x, ') orig: ', orig, ' check: ', check)
                    ok = False
                # else:
                #    print('OK (', bat, x,
                #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if verilog_filename is not None:
        m.to_verilog(verilog_filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=sim_filename)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
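
A hypothetical driver for the flow above: generate the IP-XACT package, dump the Verilog source, and verify the result with the default Verilator simulation.

if __name__ == '__main__':
    # relies on the module-level imports of the example
    rslt = run(verilog_filename='yolov3tiny.v')
    print(rslt)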