Python FetchBlobs Examples, caffe2.python.workspace.FetchBlobs Python Examples

Example #1

0

Show file

def im_proposals(model, im):
    """Generate RPN proposals on a single image."""
    inputs = {}
    inputs['data'], inputs['im_info'] = _get_image_blob(im)
    for k, v in inputs.items():
        workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32, copy=False))
    workspace.RunNet(model.net.Proto().name)
    scale = inputs['im_info'][0, 2]

    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN:
        k_max = cfg.FPN.RPN_MAX_LEVEL
        k_min = cfg.FPN.RPN_MIN_LEVEL
        rois_names = [
            core.ScopedName('rpn_rois_fpn' + str(l))
            for l in range(k_min, k_max + 1)]
        score_names = [
            core.ScopedName('rpn_roi_probs_fpn' + str(l))
            for l in range(k_min, k_max + 1)]
        blobs = workspace.FetchBlobs(rois_names + score_names)
        # Combine predictions across all levels and retain the top scoring
        boxes = np.concatenate(blobs[:len(rois_names)])
        scores = np.concatenate(blobs[len(rois_names):]).squeeze()
        # TODO(rbg): NMS again?
        inds = np.argsort(-scores)[:cfg.TEST.RPN_POST_NMS_TOP_N]
        scores = scores[inds]
        boxes = boxes[inds, :]
    else:
        boxes, scores = workspace.FetchBlobs(
            [core.ScopedName('rpn_rois'), core.ScopedName('rpn_roi_probs')])
        scores = scores.squeeze()

    # Column 0 is the batch index in the (batch ind, x1, y1, x2, y2) encoding,
    # so we remove it since we just want to return boxes
    boxes = boxes[:, 1:] / scale
    return boxes, scores

Example #2

0

Show file

def test_nativeop_forward(
        model,
        inputs: List[Tuple[str, np.ndarray]],
        reference_outputs: List[Tuple[str, Optional[np.ndarray]]],
        metrics: List[TestMetric] = DefaultOpMetrics()
) -> List[Any]:
    """ Tests a framework-native operator's forward method on list of given
        input tensors, without surrounding overhead.
        Similar to `test_op_forward`, but does not incur the overhead of
        converting from/to numpy arrays, creating multiple sessions etc.

        @param op An operator to test.
        @param inputs An array of inputs to pass to the operator.
        @param reference_outputs An array of reference outputs.
        @param metrics A list of TestMetric objects to measure.
        @return List of test metric results for every output.
    """
    assert isinstance(inputs, (list, tuple)) == True

    normal_metrics = [m for m in metrics if m.reruns == 0]
    rerun_metrics = [m for m in metrics if m.reruns > 0]

    for (name, inp) in inputs:
        workspace.FeedBlob(name, inp)

    num_outputs = len(reference_outputs)

    # Create a single session
    workspace.CreateNet(model.net)

    for metric in normal_metrics:
        metric.begin(inputs)

    workspace.RunNet(model.net)
    outputs = workspace.FetchBlobs([name for (name, out) in reference_outputs])
    for metric in normal_metrics:
        metric.end(outputs)

    for metric in rerun_metrics:
        for i in range(metric.reruns):
            metric.begin(inputs)
            workspace.RunNet(model.net)
            outputs = workspace.FetchBlobs([name for (name, out) in reference_outputs])
            metric.end(outputs)

    if not isinstance(outputs, (list, tuple)):
        outputs = [outputs]

    results = []
    for i in range(num_outputs):
        # Execute metrics.
        for metric in metrics:
            result = metric.measure(inputs, outputs[i], (reference_outputs[i])[1])
            results.append(result)
            summary = metric.measure_summary(inputs, outputs[i],
                                             reference_outputs[i][1])
            print("{} on native inference for output {}: {}".format(
                type(metric).__name__, i, summary))

    return results

Example #3

0

Show file

    def _test_std(self):
        root_dir = osp.join('/private', 'home', 'xinleic', 'pyramid')
        cfg_file = osp.join(root_dir, 'configs', 'visual_genome', 'e2e_faster_rcnn_R-50-FPN_1x.yaml')
        merge_cfg_from_file(cfg_file)
        cfg.NUM_GPUS = 1
        cfg.TEST.RPN_PRE_NMS_TOP_N = 100
        cfg.TEST.RPN_POST_NMS_TOP_N = 20
        assert_and_infer_cfg()
        test_weight = osp.join(root_dir, 'outputs', 'train', 'visual_genome_train', 
                            'e2e_faster_rcnn_R-50-FPN_1x', 'RNG_SEED#3', 'model_final.pkl')
        model = test_engine.initialize_model_from_cfg(test_weight, gpu_id=0)
        dataset = JsonDataset('visual_genome_val')
        roidb = dataset.get_roidb()
        num_images = len(roidb)
        num_classes = cfg.MODEL.NUM_CLASSES
        entry = roidb[1]
        im = cv2.imread(entry['image'])
        max_level = cfg.FPN.RPN_MAX_LEVEL
        min_level = cfg.FPN.RPN_MIN_LEVEL
        # input: rpn_cls_probs_fpn2, rpn_bbox_pred_fpn2
        # output: rpn_rois_fpn2, rpn_roi_probs_fpn2
        with utils.c2.NamedCudaScope(0):
            # let's manually do the testing here
            inputs, im_scale = _get_blobs(im, None, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE)

            for k, v in inputs.items():
                workspace.FeedBlob(core.ScopedName(k), v)

            workspace.RunNet(model.net.Proto().name)
            cls_probs = [core.ScopedName('rpn_cls_probs_fpn%d' % i) for i in range(min_level, max_level+1)]
            box_preds = [core.ScopedName('rpn_bbox_pred_fpn%d' % i) for i in range(min_level, max_level+1)]
            rpn_rois = [core.ScopedName('rpn_rois_fpn%d' % i) for i in range(min_level, max_level+1)]
            rpn_roi_probs = [core.ScopedName('rpn_roi_probs_fpn%d' % i) for i in range(min_level, max_level+1)]

            cls_probs = workspace.FetchBlobs(cls_probs)
            box_preds = workspace.FetchBlobs(box_preds)
            rpn_rois = workspace.FetchBlobs(rpn_rois)
            rpn_roi_probs = workspace.FetchBlobs(rpn_roi_probs)

        rpn_rois = np.vstack(rpn_rois)
        rpn_roi_probs = np.vstack(rpn_roi_probs)
        # remove the image dimension
        rpn_rois = rpn_rois[:, 1:]
        boxes = np.hstack([rpn_rois, rpn_roi_probs])
        im_name = osp.splitext(osp.basename(entry['image']))[0]
        utils.vis.vis_one_image(im[:, :, ::-1],
                                '{:s}-std-output'.format(im_name),
                                osp.join(root_dir, 'tests'),
                                boxes,
                                segms=None,
                                keypoints=None,
                                thresh=0.,
                                box_alpha=0.8,
                                dataset=dataset,
                                show_class=False) 
        workspace.ResetWorkspace()
        im_info = inputs['im_info'].astype(np.float32)

        return cls_probs, box_preds, im_info, im, im_name, root_dir, dataset

Example #4

0

Show file

    def test_log_barrier(self, X):
        param = core.BlobReference("X")
        workspace.FeedBlob(param, X)
        train_init_net, train_net = self.get_training_nets()
        reg = regularizer.LogBarrier(1.0)
        output = reg(train_net,
                     train_init_net,
                     param,
                     by=RegularizationBy.ON_LOSS)
        reg(
            train_net,
            train_init_net,
            param,
            grad=None,
            by=RegularizationBy.AFTER_OPTIMIZER,
        )
        workspace.RunNetOnce(train_init_net)
        workspace.RunNetOnce(train_net)

        def ref(X):
            return (
                np.array(np.sum(-np.log(np.clip(X, 1e-9, None))) * 0.5).astype(
                    np.float32),
                np.clip(X, 1e-9, None),
            )

        for x, y in zip(workspace.FetchBlobs([output, param]), ref(X)):
            npt.assert_allclose(x, y, rtol=1e-3)

Example #5

0

Show file

 def testFetchBlobs(self):
     s1 = "test1"
     s2 = "test2"
     workspace.FeedBlob('s1', s1)
     workspace.FeedBlob('s2', s2)
     fetch1, fetch2 = workspace.FetchBlobs(['s1', 's2'])
     self.assertEquals(s1, fetch1)
     self.assertEquals(s2, fetch2)

Example #6

0

Show file

File: workspace_test.py Project: JonghyunBae/FlashNeuron

 def testFetchBlobs(self):
     s1 = b"test1"
     s2 = b"test2"
     workspace.FeedBlob("s1", s1)
     workspace.FeedBlob("s2", s2)
     fetch1, fetch2 = workspace.FetchBlobs(["s1", "s2"])
     self.assertEquals(s1, fetch1)
     self.assertEquals(s2, fetch2)

Example #7

0

Show file

File: negate_gradient_op_test.py Project: wolfviking0/caffe2_SSD

    def test_grad(self, size):
        X = np.random.random_sample(size)
        workspace.ResetWorkspace()
        workspace.FeedBlob("X", X.astype(np.float32))

        net = core.Net("negate_grad_test")
        Y = net.NegateGradient(["X"], ["Y"])

        grad_map = net.AddGradientOperators([Y])
        workspace.RunNetOnce(net)

        # check X_grad == negate of Y_grad
        x_val, y_val = workspace.FetchBlobs(['X', 'Y'])
        x_grad_val, y_grad_val = workspace.FetchBlobs(
            [grad_map['X'], grad_map['Y']])
        np.testing.assert_array_equal(x_val, y_val)
        np.testing.assert_array_equal(x_grad_val, y_grad_val * (-1))

Example #8

0

Show file

File: nomnigraph_transformations_test.py Project: awthomp/pytorch-dev

 def test_transpose_network(self, batch_size, channels, height, width, seed,
                            kernel):
     net = core.Net("net")
     net.Conv(["X", "w1", "b1"], ["c1"], stride=1, pad=0, kernel=kernel)
     net.Conv(["X", "w2", "b2"], ["c2"], stride=1, pad=0, kernel=kernel)
     # c1 and c2: batch_size, 2*channels, height - kernel + 1, width - kernel + 1
     net.Conv(["c1", "w3", "b3"], ["c3"], stride=1, pad=0, kernel=kernel)
     net.Conv(["c1", "w4", "b4"], ["c4"], stride=1, pad=0, kernel=kernel)
     # c3 and c4: batch_size, 2*channels, height - 2*kernel + 2, width - 2*kernel + 2
     net.Flatten(["c3"], "c3f")
     net.Flatten(["c4"], "c4f")
     net.Flatten(["X"], "Xf")
     net.Concat(["c3f", "c4f", "Xf"], ["out", "split_info"],
                axis=1,
                add_axis=0)
     np.random.seed(seed)
     workspace.ResetWorkspace()
     tu.randBlobFloat32("X", batch_size, channels, height, width)
     tu.randBlobsFloat32(["w1", "w2"], 2 * channels, channels, kernel,
                         kernel)
     tu.randBlobsFloat32(["b1", "b2"], 2 * channels)
     tu.randBlobsFloat32(["w3", "w4"], 4 * channels, 2 * channels, kernel,
                         kernel)
     tu.randBlobsFloat32(["b3", "b4"], 4 * channels)
     all_inp_names = ["X", "w1", "w2", "b1", "b2", "w3", "w4", "b3", "b4"]
     all_input = workspace.FetchBlobs(all_inp_names)
     workspace.RunNetOnce(net)
     preTransformC1 = workspace.FetchBlob("c1")
     preTransformC3 = workspace.FetchBlob("c3")
     preTransformOut = workspace.FetchBlob("out")
     nn = ng.NNModule(net)
     preTransformNumOperators = len(nn.operators)
     preTransformNumTensors = len(nn.tensors)
     transpose_network(nn)
     new_netdef = nn.convertToCaffe2Proto()
     postTransformNumOperators = len(nn.operators)
     postTransformNumTensors = len(nn.tensors)
     # The minimal number of additional operators and tensors is at least one
     # NCHW2NHWC operator and tensor for each channel-based input tensor
     # and a NHWC2NCHW operator and tensor for the output of each convolution
     # X, w1, w2, w3, w4 are channel-based inputs
     # c1, c2, c3, c4 are the outputs of convolutions
     # i.e. a total of 9.
     self.assertEqual(postTransformNumOperators,
                      preTransformNumOperators + 9,
                      "expected 9 additional operators")
     self.assertEqual(postTransformNumTensors, preTransformNumTensors + 9,
                      "expected 9 additional tensors")
     workspace.ResetWorkspace()
     for name, val in zip(all_inp_names, all_input):
         workspace.FeedBlob(name, val)
     workspace.RunNetOnce(new_netdef)
     postTransformC1 = workspace.FetchBlob("c1")
     postTransformC3 = workspace.FetchBlob("c3")
     postTransformOut = workspace.FetchBlob("out")
     np.testing.assert_almost_equal(postTransformC1, preTransformC1, 1)
     np.testing.assert_almost_equal(postTransformC3, preTransformC3, 1)
     np.testing.assert_almost_equal(postTransformOut, preTransformOut, 1)

Example #9

0

Show file

File: rpn_generator.py Project: assafmus/Detectron

def im_proposals(model, im):
    """Generate RPN proposals on a single image."""
    inputs = {}
    inputs['data'], inputs['im_info'] = _get_image_blob(im)
    scale = inputs['im_info'][0, 2]
    for k, v in inputs.items():
        workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32,
                                                        copy=False))
    workspace.RunNet(model.net.Proto().name)

    if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_RPN:
        k_max = cfg.FPN.RPN_MAX_LEVEL
        k_min = cfg.FPN.RPN_MIN_LEVEL
        rois_names = [
            core.ScopedName('rpn_rois_fpn' + str(l))
            for l in range(k_min, k_max + 1)
        ]
        score_names = [
            core.ScopedName('rpn_roi_probs_fpn' + str(l))
            for l in range(k_min, k_max + 1)
        ]
        blobs = workspace.FetchBlobs(rois_names + score_names)
        # Combine predictions across all levels and retain the top scoring
        boxes = np.concatenate(blobs[:len(rois_names)])
        scores = np.concatenate(blobs[len(rois_names):]).squeeze()
        # Discussion: one could do NMS again after combining predictions from
        # the different FPN levels. Conceptually, it's probably the right thing
        # to do. For arbitrary reasons, the original FPN RPN implementation did
        # not do another round of NMS.
        inds = np.argsort(-scores)[:cfg.TEST.RPN_POST_NMS_TOP_N]
        scores = scores[inds]
        boxes = boxes[inds, :]
    else:
        boxes, scores = workspace.FetchBlobs(
            [core.ScopedName('rpn_rois'),
             core.ScopedName('rpn_roi_probs')])
        scores = scores.squeeze()

    # Column 0 is the batch index in the (batch ind, x1, y1, x2, y2) encoding,
    # so we remove it since we just want to return boxes
    # Scale proposals back to the original input image scale
    boxes = boxes[:, 1:] / scale
    return boxes, scores

Example #10

0

Show file

File: test_cpu_v2_detection_op.py Project: zhengant/detectron-vlp

    def _run_general_op_cpu(self, op_name, blobs_in, values_in, blobs_out, **kwargs):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
            op = core.CreateOperator(op_name, blobs_in, blobs_out, **kwargs)
            for name, value in zip(blobs_in, values_in):
                workspace.FeedBlob(name, value)

        workspace.RunOperatorOnce(op)
        values_out = workspace.FetchBlobs(blobs_out)
        workspace.ResetWorkspace()

        return values_out

Example #11

0

Show file

File: test_retinanet.py Project: v-chixma/detectron

def im_detect_bbox(model, im, timers=None):
    """Generate RetinaNet detections on a single image."""
    if timers is None:
        timers = defaultdict(Timer)
    # Although anchors are input independent and could be precomputed,
    # recomputing them per image only brings a small overhead
    anchors = _create_cell_anchors()
    timers['im_detect_bbox'].tic()
    k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
    A = cfg.RETINANET.SCALES_PER_OCTAVE * len(cfg.RETINANET.ASPECT_RATIOS)
    inputs = {}
    inputs['data'], inputs['im_info'] = _get_image_blob(im)
    cls_probs, box_preds = [], []
    for lvl in range(k_min, k_max + 1):
        suffix = 'fpn{}'.format(lvl)
        cls_probs.append(core.ScopedName('retnet_cls_prob_{}'.format(suffix)))
        box_preds.append(core.ScopedName('retnet_bbox_pred_{}'.format(suffix)))
    for k, v in inputs.items():
        workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32,
                                                        copy=False))

    workspace.RunNet(model.net.Proto().name)
    scale = inputs['im_info'][0, 2]
    cls_probs = workspace.FetchBlobs(cls_probs)
    box_preds = workspace.FetchBlobs(box_preds)

    # here the boxes_all are [x0, y0, x1, y1, score]
    boxes_all = defaultdict(list)

    cnt = 0
    for lvl in range(k_min, k_max + 1):
        # create cell anchors array
        stride = 2.**lvl
        cell_anchors = anchors[lvl]

        # fetch per level probability
        cls_prob = cls_probs[cnt]
        box_pred = box_preds[cnt]
        cls_prob = cls_prob.reshape(
            (cls_prob.shape[0], A, int(cls_prob.shape[1] / A),
             cls_prob.shape[2], cls_prob.shape[3]))
        box_pred = box_pred.reshape(
            (box_pred.shape[0], A, 4, box_pred.shape[2], box_pred.shape[3]))
        cnt += 1

        if cfg.RETINANET.SOFTMAX:
            cls_prob = cls_prob[:, :, 1::, :, :]

        cls_prob_ravel = cls_prob.ravel()
        # In some cases [especially for very small img sizes], it's possible that
        # candidate_ind is empty if we impose threshold 0.05 at all levels. This
        # will lead to errors since no detections are found for this image. Hence,
        # for lvl 7 which has small spatial resolution, we take the threshold 0.0
        th = cfg.RETINANET.INFERENCE_TH if lvl < k_max else 0.0
        candidate_inds = np.where(cls_prob_ravel > th)[0]
        if (len(candidate_inds) == 0):
            continue

        pre_nms_topn = min(cfg.RETINANET.PRE_NMS_TOP_N, len(candidate_inds))
        inds = np.argpartition(cls_prob_ravel[candidate_inds],
                               -pre_nms_topn)[-pre_nms_topn:]
        inds = candidate_inds[inds]

        inds_5d = np.array(np.unravel_index(inds, cls_prob.shape)).transpose()
        classes = inds_5d[:, 2]
        anchor_ids, y, x = inds_5d[:, 1], inds_5d[:, 3], inds_5d[:, 4]
        scores = cls_prob[:, anchor_ids, classes, y, x]

        boxes = np.column_stack((x, y, x, y)).astype(dtype=np.float32)
        boxes *= stride
        boxes += cell_anchors[anchor_ids, :]

        if not cfg.RETINANET.CLASS_SPECIFIC_BBOX:
            box_deltas = box_pred[0, anchor_ids, :, y, x]
        else:
            box_cls_inds = classes * 4
            box_deltas = np.vstack([
                box_pred[0, ind:ind + 4, yi, xi]
                for ind, yi, xi in zip(box_cls_inds, y, x)
            ])
        pred_boxes = (box_utils.bbox_transform(boxes, box_deltas)
                      if cfg.TEST.BBOX_REG else boxes)
        pred_boxes /= scale
        pred_boxes = box_utils.clip_tiled_boxes(pred_boxes, im.shape)
        box_scores = np.zeros((pred_boxes.shape[0], 5))
        box_scores[:, 0:4] = pred_boxes
        box_scores[:, 4] = scores

        for cls in range(1, cfg.MODEL.NUM_CLASSES):
            inds = np.where(classes == cls - 1)[0]
            if len(inds) > 0:
                boxes_all[cls].extend(box_scores[inds, :])
    timers['im_detect_bbox'].toc()

    # Combine predictions across all levels and retain the top scoring by class
    timers['misc_bbox'].tic()
    detections = []
    for cls, boxes in boxes_all.items():
        cls_dets = np.vstack(boxes).astype(dtype=np.float32)
        # do class specific nms here
        keep = box_utils.nms(cls_dets, cfg.TEST.NMS)
        cls_dets = cls_dets[keep, :]
        out = np.zeros((len(keep), 6))
        out[:, 0:5] = cls_dets
        out[:, 5].fill(cls)
        detections.append(out)

    # detections (N, 6) format:
    #   detections[:, :4] - boxes
    #   detections[:, 4] - scores
    #   detections[:, 5] - classes
    detections = np.vstack(detections)
    # sort all again
    inds = np.argsort(-detections[:, 4])
    detections = detections[inds[0:cfg.TEST.DETECTIONS_PER_IM], :]

    # Convert the detections to image cls_ format (see core/test_engine.py)
    num_classes = cfg.MODEL.NUM_CLASSES
    cls_boxes = [[] for _ in range(cfg.MODEL.NUM_CLASSES)]
    for c in range(1, num_classes):
        inds = np.where(detections[:, 5] == c)[0]
        cls_boxes[c] = detections[inds, :5]
    timers['misc_bbox'].toc()

    return cls_boxes

Example #12

0

Show file

def make_nps(xs):
    assert isinstance(xs,
                      list), 'ERROR: should pass list of names of the blobs'
    return workspace.FetchBlobs(xs)

Example #13

0

Show file

    def test_record_queue(self):
        num_prod = 8
        num_consume = 3
        schema = Struct(
            ('floats', Map(Scalar(np.int32), Scalar(np.float32))), )
        contents_raw = [
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
        ]
        contents = from_blob_list(schema, contents_raw)
        ds = Dataset(schema)
        net = core.Net('init')
        ds.init_empty(net)

        content_blobs = NewRecord(net, contents)
        FeedRecord(content_blobs, contents)
        writer = ds.writer(init_net=net)
        writer.write_record(net, content_blobs)
        reader = ds.reader(init_net=net)

        # prepare receiving dataset
        rec_dataset = Dataset(contents, name='rec')
        rec_dataset.init_empty(init_net=net)
        rec_dataset_writer = rec_dataset.writer(init_net=net)

        workspace.RunNetOnce(net)

        queue = RecordQueue(contents, num_threads=num_prod)

        def process(net, fields):
            new_fields = []
            for f in fields.field_blobs():
                new_f = net.Copy(f)
                new_fields.append(new_f)
            new_fields = from_blob_list(fields, new_fields)
            return new_fields

        q_reader, q_step, q_exit, fields = queue.build(reader, process)
        producer_step = core.execution_step('producer', [q_step, q_exit])

        consumer_steps = []
        for i in range(num_consume):
            name = 'queue_reader_' + str(i)
            net_consume = core.Net(name)
            should_stop, fields = q_reader.read_record(net_consume)
            step_consume = core.execution_step(name, net_consume)

            name = 'dataset_writer_' + str(i)
            net_dataset = core.Net(name)
            rec_dataset_writer.write(net_dataset, fields.field_blobs())
            step_dataset = core.execution_step(name, net_dataset)

            step = core.execution_step('consumer_' + str(i),
                                       [step_consume, step_dataset],
                                       should_stop_blob=should_stop)
            consumer_steps.append(step)
        consumer_step = core.execution_step('consumers',
                                            consumer_steps,
                                            concurrent_substeps=True)

        work_steps = core.execution_step('work',
                                         [producer_step, consumer_step],
                                         concurrent_substeps=True)

        plan = core.Plan('test')
        plan.AddStep(work_steps)
        core.workspace.RunPlan(plan)
        data = workspace.FetchBlobs(rec_dataset.get_blobs())
        self.assertEqual(6, sum(data[0]))
        self.assertEqual(150, sum(data[1]))
        self.assertAlmostEqual(15, sum(data[2]), places=5)

Example #14

0

Show file

def im_detect_bbox(model, im, timers=None, model1=None):
    """Generate RetinaNet detections on a single image."""
    if timers is None:
        timers = defaultdict(Timer)

    if model1 is None and os.environ.get('COSIM'):
        print("cosim must has model1")

    fp32_ws_name = "__fp32_ws__"
    int8_ws_name = "__int8_ws__"
    # Although anchors are input independent and could be precomputed,
    # recomputing them per image only brings a small overhead
    anchors = _create_cell_anchors()
    timers['im_detect_bbox'].tic()
    timers['data1'].tic()
    k_max, k_min = cfg.FPN.RPN_MAX_LEVEL, cfg.FPN.RPN_MIN_LEVEL
    A = cfg.RETINANET.SCALES_PER_OCTAVE * len(cfg.RETINANET.ASPECT_RATIOS)
    inputs = {}
    inputs['data'], im_scale, inputs['im_info'] = \
        blob_utils.get_image_blob(im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE, cfg.TEST.SIZEFIX)
    cls_probs, box_preds = [], []
    for lvl in range(k_min, k_max + 1):
        suffix = 'fpn{}'.format(lvl)
        cls_probs.append(core.ScopedName('retnet_cls_prob_{}'.format(suffix)))
        box_preds.append(core.ScopedName('retnet_bbox_pred_{}'.format(suffix)))
    for k, v in inputs.items():
        if os.environ.get('COSIM'):
            workspace.SwitchWorkspace(int8_ws_name, True)
        workspace.FeedBlob(core.ScopedName(k), v.astype(np.float32,
                                                        copy=False))
        if os.environ.get('COSIM'):
            workspace.SwitchWorkspace(fp32_ws_name, True)
            workspace.FeedBlob(core.ScopedName(k),
                               v.astype(np.float32, copy=False))
    timers['data1'].toc()
    if os.environ.get('EPOCH2OLD') == "1":
        workspace.RunNet(model.net.Proto().name)
    timers['run'].tic()
    if os.environ.get('INT8INFO') == "1":
        algorithm = AbsmaxCalib()
        kind = os.environ.get('INT8CALIB')
        if kind == "moving_average":
            ema_alpha = 0.5
            algorithm = EMACalib(ema_alpha)
        elif kind == "kl_divergence":

            kl_iter_num_for_range = os.environ.get('INT8KLNUM')
            if not kl_iter_num_for_range:
                kl_iter_num_for_range = 100
            kl_iter_num_for_range = int(kl_iter_num_for_range)
            algorithm = KLCalib(kl_iter_num_for_range)
        calib = Calibrator(algorithm)
        calib.RunCalibIter(workspace, model.net.Proto())
    else:
        if os.environ.get('COSIM'):
            with open("int8.txt", "wb") as p:
                p.write(str(model.net.Proto()))
            with open("fp32.txt", "wb") as p:
                p.write(str(model1.net.Proto()))
            for i in range(len(model.net.Proto().op)):
                workspace.SwitchWorkspace(int8_ws_name)
                int8_inputs = []
                for inp in model.net.Proto().op[i].input:
                    int8_inputs.append(workspace.FetchBlob(str(inp)))
                logging.warning(" opint8[{0}] is  {1}".format(
                    i,
                    model.net.Proto().op[i]))
                workspace.RunOperatorOnce(model.net.Proto().op[i])
                int8_results = []
                for res in model.net.Proto().op[i].output:
                    int8_results.append(workspace.FetchBlob(str(res)))
                workspace.SwitchWorkspace(fp32_ws_name)
                fp32_inputs = []
                for inp1 in model1.net.Proto().op[i].input:
                    fp32_inputs.append(workspace.FetchBlob(str(inp1)))
                logging.warning(" opfp32[{0}] is  {1}".format(
                    i,
                    model1.net.Proto().op[i]))
                workspace.RunOperatorOnce(model1.net.Proto().op[i])
                fp32_results = []
                for res1 in model1.net.Proto().op[i].output:
                    fp32_results.append(workspace.FetchBlob(str(res1)))
                if len(int8_inputs) != len(fp32_inputs):
                    logging.error("Wrong number of inputs")
                    return
                if len(int8_results) != len(fp32_results):
                    logging.error("Wrong number of outputs")
                    return
                logging.warning("begin to check op[{}] {} input".format(
                    i,
                    model.net.Proto().op[i].type))
                for k in range(len(int8_inputs)):
                    if model.net.Proto().op[i].input[k][0] == '_':
                        continue
                    #assert_allclose(int8_inputs[k], fp32_inputs[k], **tol)
                logging.warning("pass checking op[{0}] {1} input".format(
                    i,
                    model.net.Proto().op[i].type))
                logging.warning("begin to check op[{0}] {1} output".format(
                    i,
                    model.net.Proto().op[i].type))
                for j, int8_result in enumerate(int8_results):
                    if model.net.Proto().op[i].output[j][0] == '_':
                        continue
                    #logging.warning("int8_outputis {} and fp32 output is {} ".format(int8_results[j], fp32_results[j]))
                    #if not compare_utils.assert_allclose(int8_results[j], fp32_results[j], **tol):
                    if not compare_utils.assert_compare(
                            int8_result, fp32_results[j], 1e-01,
                            os.environ.get('COSIM')):
                        for k, int8_input in enumerate(int8_inputs):
                            logging.warning("int8_input[{}] is {}".format(
                                k, int8_input))
                            logging.warning("fp32_input[{}] is {}".format(
                                k, fp32_inputs[k]))

                logging.warning("pass checking op[{0}] {1} output".format(
                    i,
                    model.net.Proto().op[i].type))
        else:
            workspace.RunNet(model.net.Proto().name)
    timers['run'].toc()
    cls_probs = workspace.FetchBlobs(cls_probs)
    box_preds = workspace.FetchBlobs(box_preds)

    # here the boxes_all are [x0, y0, x1, y1, score]
    boxes_all = defaultdict(list)

    batch_size = cls_probs[0].shape[0]
    boxes_all_list = [boxes_all] * batch_size
    cnt = 0
    for lvl in range(k_min, k_max + 1):
        # create cell anchors array
        stride = 2.**lvl
        cell_anchors = anchors[lvl]

        # fetch per level probability
        cls_prob = cls_probs[cnt]
        box_pred = box_preds[cnt]
        cls_prob = cls_prob.reshape(
            (cls_prob.shape[0], A, int(cls_prob.shape[1] / A),
             cls_prob.shape[2], cls_prob.shape[3]))
        box_pred = box_pred.reshape(
            (box_pred.shape[0], A, 4, box_pred.shape[2], box_pred.shape[3]))
        cnt += 1

        if cfg.RETINANET.SOFTMAX:
            cls_prob = cls_prob[:, :, 1::, :, :]

        for i in range(batch_size):
            cls_prob_ravel = cls_prob[i, :].ravel()

            # In some cases [especially for very small img sizes], it's possible that
            # candidate_ind is empty if we impose threshold 0.05 at all levels. This
            # will lead to errors since no detections are found for this image. Hence,
            # for lvl 7 which has small spatial resolution, we take the threshold 0.0
            th = cfg.RETINANET.INFERENCE_TH if lvl < k_max else 0.0
            candidate_inds = np.where(cls_prob_ravel > th)[0]
            if (len(candidate_inds) == 0):
                continue

            pre_nms_topn = min(cfg.RETINANET.PRE_NMS_TOP_N,
                               len(candidate_inds))
            inds = np.argpartition(cls_prob_ravel[candidate_inds],
                                   -pre_nms_topn)[-pre_nms_topn:]
            inds = candidate_inds[inds]

            inds_4d = np.array(np.unravel_index(
                inds, (cls_prob[i, :]).shape)).transpose()
            classes = inds_4d[:, 1]
            anchor_ids, y, x = inds_4d[:, 0], inds_4d[:, 2], inds_4d[:, 3]
            scores = cls_prob[i, anchor_ids, classes, y, x]
            boxes = np.column_stack((x, y, x, y)).astype(dtype=np.float32)
            boxes *= stride
            boxes += cell_anchors[anchor_ids, :]

            if not cfg.RETINANET.CLASS_SPECIFIC_BBOX:
                box_deltas = box_pred[i, anchor_ids, :, y, x]
            else:
                box_cls_inds = classes * 4
                box_deltas = np.vstack([
                    box_pred[i, ind:ind + 4, yi, xi]
                    for ind, yi, xi in zip(box_cls_inds, y, x)
                ])
            pred_boxes = (box_utils.bbox_transform(boxes, box_deltas)
                          if cfg.TEST.BBOX_REG else boxes)
            pred_boxes /= im_scale
            pred_boxes = box_utils.clip_tiled_boxes(pred_boxes, im[0].shape)
            box_scores = np.zeros((pred_boxes.shape[0], 5))
            box_scores[:, 0:4] = pred_boxes
            box_scores[:, 4] = scores

            for cls in range(1, cfg.MODEL.NUM_CLASSES):
                inds = np.where(classes == cls - 1)[0]
                if len(inds) > 0:
                    boxes_all_list[i][cls].extend(box_scores[inds, :])

    timers['im_detect_bbox'].toc()

    cls_boxes_list = []
    for i in range(batch_size):
        boxes_all = boxes_all_list[i]
        # Combine predictions across all levels and retain the top scoring by class
        timers['misc_bbox'].tic()
        detections = []
        for cls, boxes in boxes_all.items():
            cls_dets = np.vstack(boxes).astype(dtype=np.float32)
            # do class specific nms here
            keep = box_utils.nms(cls_dets, cfg.TEST.NMS)
            cls_dets = cls_dets[keep, :]
            out = np.zeros((len(keep), 6))
            out[:, 0:5] = cls_dets
            out[:, 5].fill(cls)
            detections.append(out)

        # detections (N, 6) format:
        #   detections[:, :4] - boxes
        #   detections[:, 4] - scores
        #   detections[:, 5] - classes
        detections = np.vstack(detections)
        # sort all again
        inds = np.argsort(-detections[:, 4])
        detections = detections[inds[0:cfg.TEST.DETECTIONS_PER_IM], :]

        # Convert the detections to image cls_ format (see core/test_engine.py)
        num_classes = cfg.MODEL.NUM_CLASSES
        cls_boxes = [[] for _ in range(cfg.MODEL.NUM_CLASSES)]
        for c in range(1, num_classes):
            inds = np.where(detections[:, 5] == c)[0]
            cls_boxes[c] = detections[inds, :5]
        cls_boxes_list.append(cls_boxes)

    timers['misc_bbox'].toc()

    return cls_boxes_list