コード例 #1
0
def cat_boxlist(bboxes):
    """
    Merge several BoxList objects into one.

    All inputs must share the same ``size``, ``mode`` and set of extra
    fields; boxes and every extra field are concatenated along dim 0.
    An empty input yields an empty 'xyxy' BoxList with size (0, 0) and
    no extra fields.

    Arguments:
        bboxes (list[BoxList] or tuple[BoxList])

    Returns:
        BoxList: the concatenated boxes with their concatenated fields.
    """
    assert isinstance(bboxes, (list, tuple))
    for candidate in bboxes:
        assert isinstance(candidate, BoxList)

    # Nothing to merge: hand back an empty placeholder.
    if len(bboxes) == 0:
        return BoxList(jt.empty((0,4)), (0,0), mode='xyxy')

    image_size = bboxes[0].size
    for candidate in bboxes:
        assert candidate.size == image_size

    box_mode = bboxes[0].mode
    for candidate in bboxes:
        assert candidate.mode == box_mode

    field_names = set(bboxes[0].fields())
    for candidate in bboxes:
        assert set(candidate.fields()) == field_names

    stacked_boxes = _cat([item.bbox for item in bboxes], dim=0)
    merged = BoxList(stacked_boxes, image_size, box_mode)
    for name in field_names:
        per_box_values = [item.get_field(name) for item in bboxes]
        merged.add_field(name, _cat(per_box_values, dim=0))
    return merged
コード例 #2
0
ファイル: mask_prob.py プロジェクト: li-xl/detectron.jittor
def mask_prob_cuda(embed_pixel, embed_center, sigma_center, boxes, box_areas,
                   area_sum, mask_width):
    """Run the mask-probability CUDA kernel through ``jt.code``.

    Arguments:
        embed_pixel: 2-D var, M x Dim.
        embed_center: 2-D var, N x Dim.
        sigma_center: 1-D var of length N.
        boxes: var whose first dimension is N.
        box_areas: forwarded to the kernel unchanged.
        area_sum, mask_width: converted to ``int`` and attached as
            compile options.

    Returns:
        A var of shape (M, N) with the dtype of ``embed_pixel``; when
        M * N == 0 an empty array of that dtype is returned instead and
        no kernel is launched.
    """
    assert embed_pixel.ndim == 2, "embed_pixel should be MxDim"
    assert embed_center.ndim == 2, "embed_center should be NxDim"
    assert sigma_center.ndim == 1, "sigma_center should be N"
    assert embed_pixel.shape[1] == embed_center.shape[1], "Dim should the same"

    num_centers = embed_center.shape[0]
    assert num_centers == sigma_center.shape[0], \
        "center number should be the same"
    assert num_centers == boxes.shape[0], \
        "center number and box number should be the same"

    num_pixels = embed_pixel.shape[0]
    result_dtype = embed_pixel.dtype
    if num_pixels * num_centers == 0:
        return jt.array([], result_dtype)

    # The integer settings travel on a zero-length var appended to the
    # inputs -- presumably so the kernel source can read them as compile
    # options; confirm against CUDA_SRC.
    settings_carrier = jt.empty((0, ))
    settings_carrier.compile_options = {
        "area_sum": int(area_sum),
        "mask_width": int(mask_width)
    }

    kernel_inputs = [
        embed_pixel,
        embed_center,
        sigma_center,
        boxes,
        box_areas,
        settings_carrier,
    ]
    return jt.code((num_pixels, num_centers),
                   result_dtype,
                   kernel_inputs,
                   cuda_header=CUDA_HEADER,
                   cuda_src=CUDA_SRC)
コード例 #3
0
def concat(arr, dim):
    '''Concat Operator can concat a list of jt Var at a specific dimension.

    * [in] x:   input var list for concat (must not be empty)

    * [in] dim: concat which dim; a negative value counts from the last
      dimension of the first var

    * [out] out:  concat result; its shape and dtype are taken from the
      last var in the list, with ``shape[dim]`` replaced by the summed
      extent of all vars along ``dim``

    Raises ValueError when ``arr`` is empty.

Example::

        jt.concat([jt.array([[1],[2]]), jt.array([[2],[2]])], dim=1)
        # return [[1,2],[2,2]]
    '''
    # An empty list used to fail later with an unbound-variable error;
    # report it explicitly instead.
    if len(arr) == 0:
        raise ValueError("need at least one array to concat")
    # TODO: low performance when concat lots of vars
    if dim < 0:
        dim += len(arr[0].shape)
    total_dim = 0
    last = None
    for a in arr:
        total_dim += a.shape[dim]
        last = a
    # The output template (rank, other extents, dtype) comes from the last
    # var, matching the previous behaviour.
    shape = list(last.shape)
    shape[dim] = total_dim
    s = jt.empty(shape, last.dtype)
    slices = [slice(None)] * len(last.shape)
    cdim = 0
    for a in arr:
        extent = a.shape[dim]
        if extent == 0:
            # Zero-length pieces contribute nothing to the output.
            continue
        # Write this piece into its slot along `dim`.
        slices[dim] = slice(cdim, cdim + extent)
        s = s.setitem(tuple(slices), a)
        cdim += extent
    return s
コード例 #4
0
    def convert_to_binarymask(self):
        """Convert every polygon instance to a binary mask.

        Returns a BinaryMaskList built from the stacked per-polygon masks.
        When the container is empty, a boolean var with zero masks and
        spatial extent ``(size[1], size[0])`` is used instead.
        """
        if not len(self) > 0:
            dims = self.size
            # presumably size is (width, height), hence the swapped order
            stacked = jt.empty([0, dims[1], dims[0]]).bool()
        else:
            per_instance = [poly.convert_to_binarymask()
                            for poly in self.polygons]
            stacked = jt.stack(per_instance)

        return BinaryMaskList(stacked, size=self.size)
コード例 #5
0
 def forward_face_index_map(self, faces, face_index_map, weight_map,
                            depth_map, face_inv_map):
     """Delegate to ``rasterize_cuda.forward_face_index_map``.

     Allocates a scratch var with the same shape as ``faces`` for the
     kernel's ``faces_inv`` argument and forwards the renderer settings
     stored on ``self`` (image size, near/far planes and the three
     return flags converted to ints).
     """
     inverse_scratch = jt.empty(faces.shape)
     output_flags = (
         int(self.return_rgb),
         int(self.return_alpha),
         int(self.return_depth),
     )
     return rasterize_cuda.forward_face_index_map(
         faces, face_index_map, weight_map, depth_map, face_inv_map,
         inverse_scratch, self.image_size, self.near, self.far,
         *output_flags)
コード例 #6
0
ファイル: ops.py プロジェクト: vesple/PointCloudLib
 def execute(self, x_q, x_r):
     """Run the CUDA kernel stored on ``self`` and return its index output.

     Both inputs must be 3-D; their shapes are unpacked as
     (batch, channels, points). The result is an int32 var of shape
     (batch, self.k, q_points), where batch is taken from ``x_r``.
     """
     _, _, q_points = x_q.shape
     batch_size, _, r_points = x_r.shape

     # Scratch buffer handed to the kernel as its third input.
     distance_scratch = jt.empty((batch_size, r_points, q_points), "float32")

     # Kernel input order: in0 = reference points, in1 = query points.
     outputs = jt.code(
         [[batch_size, self.k, q_points]],
         ['int32'],
         [x_r, x_q, distance_scratch],
         cuda_src=self.cuda_src,
         cuda_header=self.cuda_inc,
     )
     (idxs,) = outputs
     return idxs
コード例 #7
0
def concat(arr, dim=0):
    '''Join a sequence of jt Vars along one dimension.

    * [in] arr: non-empty tuple or list of vars to join

    * [in] dim: dimension to join along; a negative value counts from the
      last dimension of the first var

    * return:  the joined var; its dtype is the merge of all input dtypes
      and its shape is that of the last var with ``shape[dim]`` replaced
      by the summed extent of all vars along ``dim``

    Raises TypeError when ``arr`` is not a Sequence and ValueError when it
    is empty.

Example::

        jt.concat([jt.array([[1],[2]]), jt.array([[2],[2]])], dim=1)
        # return [[1,2],[2,2]]
    '''
    if not isinstance(arr, Sequence):
        raise TypeError("concat arr needs to be a tuple or list")
    if len(arr) == 0:
        raise ValueError("need at least one array to concat")
    if dim < 0:
        dim += len(arr[0].shape)

    joined_extent = 0
    dtype_names = []
    for piece in arr:
        joined_extent += piece.shape[dim]
        dtype_names.append(str(piece.dtype))

    # The last var supplies the rank and the extents of the other dims.
    template = arr[-1]
    out_shape = list(template.shape)
    out_shape[dim] = joined_extent
    result = jt.empty(out_shape, dtype=_merge_dtypes(dtype_names))

    index = [slice(None)] * len(template.shape)
    offset = 0
    for piece in arr:
        extent = piece.shape[dim]
        if extent != 0:
            # Copy this piece into its slot along `dim`.
            index[dim] = slice(offset, offset + extent)
            result = result.setitem(tuple(index), piece)
            offset += extent
    return result
コード例 #8
0
 def execute(self, x, new_shape):
     """Remember the shape of ``x`` and return an uninitialised var.

     The returned var has shape ``new_shape`` and the dtype of ``x``; the
     original shape is stored in ``self.save_vars``.
     """
     self.save_vars = x.shape
     dtype_name = str(x.dtype)
     return jt.empty(new_shape, dtype_name)
コード例 #9
0
    def execute(self, faces, textures):
        """Forward pass of the rasterizer.

        Allocates the per-pixel work buffers, runs the face-index, texture
        sampling, background and alpha stages in that order, stores every
        intermediate in ``self.save_vars`` and returns
        ``(rgb, alpha, depth)``. Each returned element is an empty array
        when the matching ``self.return_*`` flag is off.
        """
        self.batch_size, self.num_faces = faces.shape[:2]

        if self.return_rgb:
            self.texture_size = textures.shape[2]
        else:
            # No colour output requested: replace textures with a dummy var.
            textures = jt.array([0]).float32()
            self.texture_size = None

        # Leading dimensions shared by every per-pixel buffer.
        pixel_grid = (self.batch_size, self.image_size, self.image_size)

        face_index_map = jt.empty(pixel_grid).int()
        weight_map = jt.empty(pixel_grid + (3,))
        depth_map = jt.empty(pixel_grid) * self.far

        if self.return_rgb:
            rgb_map = jt.empty(pixel_grid + (3,)).float()
            sampling_index_map = jt.empty(pixel_grid + (8,)).int()
            sampling_weight_map = jt.empty(pixel_grid + (8,))
        else:
            rgb_map = jt.zeros(1)
            sampling_index_map = jt.zeros(1).int()
            sampling_weight_map = jt.zeros(1)

        alpha_map = jt.empty(pixel_grid) if self.return_alpha else jt.zeros(1)

        if self.return_depth:
            face_inv_map = jt.empty(pixel_grid + (3, 3))
        else:
            face_inv_map = jt.zeros(1)

        # Stage 1: faces -> face_index_map, weight_map, depth_map, face_inv_map
        index_stage = self.forward_face_index_map(
            faces, face_index_map, weight_map, depth_map, face_inv_map)
        face_index_map, weight_map, depth_map, face_inv_map = index_stage

        # Stage 2: faces, textures and the stage-1 maps ->
        # rgb_map, sampling_index_map, sampling_weight_map
        sampling_stage = self.forward_texture_sampling(
            faces, textures, face_index_map, weight_map, depth_map, rgb_map,
            sampling_index_map, sampling_weight_map)
        rgb_map, sampling_index_map, sampling_weight_map = sampling_stage

        # Stage 3 and 4: background fill, then alpha.
        rgb_map = self.forward_background(face_index_map, rgb_map)
        alpha_map = self.forward_alpha_map(alpha_map, face_index_map)

        self.save_vars = (faces, textures, face_index_map, weight_map,
                          depth_map, rgb_map, alpha_map, face_inv_map,
                          sampling_index_map, sampling_weight_map)

        # Outputs that were not requested are returned as empty arrays.
        rgb_r = rgb_map if self.return_rgb else jt.array([])
        alpha_r = alpha_map if self.return_alpha else jt.array([])
        depth_r = depth_map if self.return_depth else jt.array([])
        return rgb_r, alpha_r, depth_r
コード例 #10
0
    def filter_results(self, boxlist, num_classes):
        """Threshold detections by score and apply per-class NMS.

        Class 0 is treated as background and skipped. For every other
        class, boxes whose score exceeds ``self.score_thresh`` are kept,
        passed through ``boxlist_nms`` with ``self.nms`` and tagged with
        the class index in a "labels" field. The per-class results are
        concatenated; if more than ``self.detections_per_img`` remain (and
        that limit is positive) only detections scoring at least the k-th
        value threshold are kept.
        """
        # Work on the raw vars to avoid BoxList overhead; with a multi-class
        # NMS this could be done directly on the boxlist.
        boxes = boxlist.bbox.reshape(-1, num_classes * 4)
        scores = boxlist.get_field("scores").reshape(-1, num_classes)

        # Compute the candidate indices of every foreground class up front
        # and sync them in a single call.
        above_thresh = scores > self.score_thresh
        inds_nonzeros = [
            above_thresh[:, cls].nonzero() for cls in range(1, num_classes)
        ]
        jt.sync(inds_nonzeros)

        result = []
        for cls, inds in enumerate(inds_nonzeros, start=1):
            if inds.shape[0] == 0:
                continue
            inds = inds.squeeze(1)
            class_scores = scores[inds, cls]
            class_boxes = boxes[inds, cls * 4 : (cls + 1) * 4]

            class_boxlist = BoxList(class_boxes, boxlist.size, mode="xyxy")
            class_boxlist.add_field("scores", class_scores)
            class_boxlist = boxlist_nms(class_boxlist, self.nms)

            kept = len(class_boxlist)
            class_boxlist.add_field("labels", jt.full((kept,), cls).int32())
            result.append(class_boxlist)

        result = cat_boxlist(result)
        # When nothing survived, the merged list lacks these fields; add
        # empty ones so callers can rely on them.
        for missing in ('labels', 'scores'):
            if not result.has_field(missing):
                result.add_field(missing, jt.empty((0,)))
        number_of_detections = len(result)

        # Limit to max_per_image detections **over all classes**
        if number_of_detections > self.detections_per_img > 0:
            cls_scores = result.get_field("scores")
            kth = number_of_detections - self.detections_per_img + 1
            image_thresh, _ = jt.kthvalue(cls_scores, kth)
            keep = jt.nonzero(cls_scores >= image_thresh).squeeze(1)
            result = result[keep]
        return result
コード例 #11
0
ファイル: rasterize.py プロジェクト: shuiguoli/jrender
def forward_face_index_map(faces, face_index_map, weight_map, depth_map,
                           face_inv_map, faces_inv, image_size, near, far,
                           return_rgb, return_alpha, return_depth):
    """Launch the CUDA kernel that rasterizes faces into per-pixel maps.

    The passed-in ``face_index_map``, ``weight_map``, ``depth_map`` and
    ``face_inv_map`` are consulted only for their shape and dtype: fresh
    outputs with those shapes are requested from ``jt.code`` and initialised
    inside ``cuda_src`` (face indices to -1, depth to ``far``, weights and
    face-inverse map to zero) before the kernel runs.

    ``faces_inv`` is passed as an input buffer; the kernel writes the
    normalised inverse matrix of each face into it.

    ``image_size``, ``near``, ``far`` and the three ``return_*`` flags are
    interpolated into the CUDA source text. The kernel is instantiated for
    ``float32`` only.

    Returns:
        The ``jt.code`` result for four outputs, in the order
        face_index_map, weight_map, depth_map, face_inv_map.
    """
    # One int per depth-map pixel, zeroed in cuda_src and used by the kernel
    # as a per-pixel mutex (atomicCAS) around the z-buffer update.
    lock = jt.empty(depth_map.shape, 'int')
    return jt.code([
        face_index_map.shape, weight_map.shape, depth_map.shape,
        face_inv_map.shape
    ], [
        face_index_map.dtype, weight_map.dtype, depth_map.dtype,
        face_inv_map.dtype
    ], [faces, faces_inv, lock],
                   cuda_header='''

#include <cuda.h>
#include <cuda_runtime.h>
#include <cassert>

#include <thrust/device_ptr.h>
#include <thrust/fill.h>

// for the older gpus atomicAdd with double arguments does not exist
#if  __CUDA_ARCH__ < 600 and defined(__CUDA_ARCH__)
static __inline__ __device__ double atomicAdd(double* address, double val) {
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old);
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif

namespace{
template <typename scalar_t,
        int image_size,
        int return_rgb,
        int return_alpha,
        int return_depth>
__global__ void forward_face_index_map_cuda_kernel(
        const scalar_t* faces,
        scalar_t* faces_inv,
        int32_t*  face_index_map,
        scalar_t*  weight_map,
        scalar_t*  depth_map,
        scalar_t*  face_inv_map,
        int batch_size,
        int num_faces,
        scalar_t near,
        scalar_t far,
        int threads_n_bits,
        int32_t* dev_tilemutex) {
    /* batch number, face, number, image size, face[v012][RGB] */
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= batch_size * (1<<threads_n_bits)) {
        return;
    }
    const int fn = i & ((1<<threads_n_bits) - 1);
     if (fn >= num_faces)
        return;
    const int bn = i>>threads_n_bits;
    i = bn * num_faces + fn;
    const int is = image_size;
    const scalar_t* face = &faces[i * 9];
    scalar_t* face_inv_g = &faces_inv[i * 9];

    /* return if backside */
    if ((face[7] - face[1]) * (face[3] - face[0]) < (face[4] - face[1]) * (face[6] - face[0]))
        return;

    /* p[num][xy]: x, y is normalized from [-1, 1] to [0, is - 1]. */
    scalar_t p[3][2];
    for (int num = 0; num < 3; num++) {
        for (int dim = 0; dim < 2; dim++) {
            p[num][dim] = 0.5 * (face[3 * num + dim] * is + is - 1);
        }
    }

    /* compute face_inv */
    scalar_t face_inv[9] = {
        p[1][1] - p[2][1], p[2][0] - p[1][0], p[1][0] * p[2][1] - p[2][0] * p[1][1],
        p[2][1] - p[0][1], p[0][0] - p[2][0], p[2][0] * p[0][1] - p[0][0] * p[2][1],
        p[0][1] - p[1][1], p[1][0] - p[0][0], p[0][0] * p[1][1] - p[1][0] * p[0][1]};
    scalar_t face_inv_denominator = (
        p[2][0] * (p[0][1] - p[1][1]) +
        p[0][0] * (p[1][1] - p[2][1]) +
        p[1][0] * (p[2][1] - p[0][1]));
    /* set to global memory */
    for (int k = 0; k < 9; k++) {
        face_inv[k] /= face_inv_denominator;
        face_inv_g[k] = face_inv[k];
    }

    /* compute the bounding box of triangle facet */
    scalar_t x_min=is, y_min=is, x_max=0, y_max=0;
    for (int num = 0; num < 3; num++) {
        if (p[num][0] < x_min)
            x_min = p[num][0];
        if (p[num][0] > x_max)
            x_max = p[num][0];
        if (p[num][1] < y_min)
            y_min = p[num][1];
        if (p[num][1] > y_max)
            y_max = p[num][1];
    }

    int ix_min = max(0, (int)x_min);
    int ix_max = min(is-1, (int)x_max);
    int iy_min = max(0, (int)y_min);
    int iy_max = min(is-1, (int)y_max);

    /* traverse each pixel in the bounding box */
    for (int xi=ix_min;xi<=ix_max;xi++) {
        for (int yi=iy_min;yi<=iy_max;yi++) {
            const scalar_t yp = (2. * yi + 1 - is) / is;
            const scalar_t xp = (2. * xi + 1 - is) / is;
            /* check [py, px] is inside the face */
            if (((yp - face[1]) * (face[3] - face[0]) < (xp - face[0]) * (face[4] - face[1])) ||
                ((yp - face[4]) * (face[6] - face[3]) < (xp - face[3]) * (face[7] - face[4])) ||
                ((yp - face[7]) * (face[0] - face[6]) < (xp - face[6]) * (face[1] - face[7])))
                continue;

            int i1 = bn * is * is + yi * is + xi;
            /* compute w = face_inv * p */
            scalar_t w[3];
            w[0] = face_inv[3 * 0 + 0] * xi + face_inv[3 * 0 + 1] * yi + face_inv[3 * 0 + 2];
            w[1] = face_inv[3 * 1 + 0] * xi + face_inv[3 * 1 + 1] * yi + face_inv[3 * 1 + 2];
            w[2] = face_inv[3 * 2 + 0] * xi + face_inv[3 * 2 + 1] * yi + face_inv[3 * 2 + 2];

            /* sum(w) -> 1, 0 < w < 1 */
            scalar_t w_sum = 0;
            for (int k = 0; k < 3; k++) {
                w[k] = min(max(w[k], 0.), 1.);
                w_sum += w[k];
            }
            for (int k = 0; k < 3; k++) {
                w[k] /= w_sum;
            }
            /* compute 1 / zp = sum(w / z) */
            const scalar_t zp = 1. / (w[0] / face[2] + w[1] / face[5] + w[2] / face[8]);
            if (zp <= near || far <= zp) {
                continue;
            }
            /* check z-buffer */
            bool isSet;
            do
            {
                isSet = (atomicCAS(&dev_tilemutex[i1], 0, 1) == 0);
                if (isSet)
                {
                    if (zp < depth_map[i1]) {
                        depth_map[i1] = zp;
                        face_index_map[i1] = fn;
                        for (int k = 0; k < 3; k++) {
                            weight_map[3 * i1 + k] = w[k];
                        }
                        if (return_depth) {
                            for (int k = 0; k < 9; k++) {
                                face_inv_map[9 * i1 + k] = face_inv[k];
                            }
                        }
                    }
                    __threadfence();
                    dev_tilemutex[i1] = 0;
                }
            } while (!isSet);
        }
    }
}

}
    ''',
                   cuda_src=f'''
    @alias(faces, in0)
    @alias(faces_inv, in1)
    @alias(face_index_map, out0)
    @alias(weight_map, out1)
    @alias(depth_map, out2)
    @alias(face_inv_map, out3)

    thrust::device_ptr<out0_type> dev_ptr0(out0_p);
    thrust::fill(dev_ptr0, dev_ptr0 + out0->num, -1);

    cudaMemsetAsync(out1_p, 0, out1->size);

    thrust::device_ptr<out2_type> dev_ptr2(out2_p);
    thrust::fill(dev_ptr2, dev_ptr2 + out2->num, {far});

    cudaMemsetAsync(out3_p, 0, out3->size);

    const auto batch_size = faces_shape0;
    const auto num_faces = faces_shape1;
    const int threads = 256;
    const int threads_n_bits = NanoVector::get_nbits(num_faces) - 1;
    const dim3 blocks_1 ((batch_size * (1<<threads_n_bits) - 1) / threads +1);

    cudaMemsetAsync(in2_p, 0, in2->size);

    forward_face_index_map_cuda_kernel<
        float32,
        (int) {image_size},
        {return_rgb},
        {return_alpha},
        {return_depth}  
    ><<<blocks_1, threads>>>(
        faces_p,
        faces_inv_p,
        face_index_map_p,
        weight_map_p,
        depth_map_p,
        face_inv_map_p,
        (int) batch_size,
        (int) num_faces,
        (float32) {near},
        (float32) {far},
        threads_n_bits,
        in2_p);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) 
            printf("Error in forward_face_index_map: %s\\n", cudaGetErrorString(err));
    ''')
コード例 #12
0
def knn(unknown, known, k):
    """Find, for each point in ``unknown``, its ``k`` nearest points in ``known``.

    Arguments:
        unknown: 3-D var unpacked as (b, n, c).
        known: 3-D var unpacked as (_, m, _).
        k: number of neighbours; substituted into the CUDA header as the
            compile-time constant ``K``.

    Returns:
        The result of ``jt.code`` for the two outputs ``dists2`` (squared
        distances, dtype "float") and ``idx`` (indices into ``known``,
        dtype "int"), both of shape (b, n, k). Each row is ordered from
        nearest to farthest.

    NOTE(review): the kernel indexes points with a fixed stride of 3, so it
    assumes c == 3; ``c`` itself is never checked. When fewer than ``k``
    candidates are found, the remaining ``idx`` entries are copied from an
    uninitialised local array -- confirm callers always have m >= k.
    """
    b, n, c = unknown.shape
    _, m, _ = known.shape
    # dists2 = jt.ones((b, n, k), dtype="float") * 1e40
    # idx = jt.zeros((b, n, k), dtype="int")
    # Outputs are left uninitialised; the kernel writes every element.
    dists2 = jt.empty((b, n, k), dtype="float")
    idx = jt.empty((b, n, k), dtype="int")
    # One CUDA block per batch entry; threads stride over the n query points.
    return jt.code([unknown, known], [dists2, idx],
                   cuda_header='''
    #define TOTAL_THREADS 512
    #define K %s

    namespace {
        inline int opt_n_threads(int work_size) {
            const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
            return max(min(1 << pow_2, TOTAL_THREADS), 1);
        }

        __global__ void three_nn_kernel(int b, int n, int m,
                                const float *__restrict__ unknown,
                                const float *__restrict__ known,
                                float *__restrict__ dist2,
                                int *__restrict__ idx) {

            int batch_index = blockIdx.x;
            unknown += batch_index * n * 3;
            known += batch_index * m * 3;
            dist2 += batch_index * n * K;
            idx += batch_index * n * K;

            int index = threadIdx.x;
            int stride = blockDim.x;
            for (int j = index; j < n; j += stride) {
                float ux = unknown[j * 3 + 0];
                float uy = unknown[j * 3 + 1];
                float uz = unknown[j * 3 + 2];

                float tmp_dist[K];
                int tmp_idx[K];
                #pragma unroll
                for (int i=0; i<K; i++) tmp_dist[i] = 1e30;
                for (int k = 0; k < m; ++k) {
                    float x = known[k * 3 + 0];
                    float y = known[k * 3 + 1];
                    float z = known[k * 3 + 2];
                    float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);

                    int first = -1;
                    #pragma unroll
                    for (int i=0; i<K; i++)
                        if (first == -1 && d<tmp_dist[i])
                            first = i;
                    if (first == -1) continue;
                    #pragma unroll
                    for (int i=0; i<K; i++)
                        if (K-1-i > first) {
                            tmp_dist[K-1-i] = tmp_dist[K-2-i];
                            tmp_idx[K-1-i] = tmp_idx[K-2-i];
                        }
                    tmp_dist[first] = d;
                    tmp_idx[first] = k;
                    /*
                    for (int l = 0; l < K; ++l) {
                        if (d < dist2[j * K + l]) {
                            for (int m = K-1; m > l; --m) {
                                dist2[j * K + m] = dist2[j * K + m - 1];
                                idx[j * K + m] = idx[j * K + m - 1];
                            }
                            dist2[j * K + l] = d;
                            idx[j * K + l] = k;
                            break;
                        }
                    }
                    */
                }
                #pragma unroll
                for (int i=0; i<K; i++) {
                    dist2[j * K + i] = tmp_dist[i];
                    idx[j * K + i] = tmp_idx[i];
                }
            }
        }
    }
    ''' % k,
                   cuda_src=f'''
    @alias(unknown, in0)
    @alias(known, in1)
    @alias(dists2, out0)
    @alias(idx, out1)

    three_nn_kernel<<<{b}, opt_n_threads({n}), 0, 0>>>({b}, {n}, {m}, unknown_p, known_p, dists2_p, idx_p);
    ''')
コード例 #13
0
def three_nn(unknown, known):
    """Find, for each point in ``unknown``, its three nearest points in ``known``.

    Arguments:
        unknown: 3-D var unpacked as (b, n, c).
        known: 3-D var unpacked as (_, m, _).

    Returns:
        The result of ``jt.code`` for the two outputs ``dists2`` (squared
        distances, dtype "float") and ``idx`` (indices into ``known``,
        dtype "int"), both of shape (b, n, 3), ordered nearest first.

    NOTE(review): the kernel indexes points with a fixed stride of 3, so it
    assumes c == 3; ``c`` itself is never checked. With fewer than three
    known points the unused slots keep their initial values (distance 1e40
    narrowed to float on store, index 0).
    """
    b, n, c = unknown.shape
    _, m, _ = known.shape
    # Outputs are left uninitialised; the kernel writes every element.
    dists2 = jt.empty((b, n, 3), dtype="float")
    idx = jt.empty((b, n, 3), dtype="int")
    # One CUDA block per batch entry; threads stride over the n query points.
    return jt.code([unknown, known], [dists2, idx],
                   cuda_header='''
    #define TOTAL_THREADS 512

    namespace {
        inline int opt_n_threads(int work_size) {
            const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
            return max(min(1 << pow_2, TOTAL_THREADS), 1);
        }

        __global__ void three_nn_kernel(int b, int n, int m,
                                const float *__restrict__ unknown,
                                const float *__restrict__ known,
                                float *__restrict__ dist2,
                                int *__restrict__ idx) {

            int batch_index = blockIdx.x;
            unknown += batch_index * n * 3;
            known += batch_index * m * 3;
            dist2 += batch_index * n * 3;
            idx += batch_index * n * 3;

            int index = threadIdx.x;
            int stride = blockDim.x;
            for (int j = index; j < n; j += stride) {
                float ux = unknown[j * 3 + 0];
                float uy = unknown[j * 3 + 1];
                float uz = unknown[j * 3 + 2];

                double best1 = 1e40, best2 = 1e40, best3 = 1e40;
                int besti1 = 0, besti2 = 0, besti3 = 0;
                for (int k = 0; k < m; ++k) {
                float x = known[k * 3 + 0];
                float y = known[k * 3 + 1];
                float z = known[k * 3 + 2];
                float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
                if (d < best1) {
                    best3 = best2;
                    besti3 = besti2;
                    best2 = best1;
                    besti2 = besti1;
                    best1 = d;
                    besti1 = k;
                } else if (d < best2) {
                    best3 = best2;
                    besti3 = besti2;
                    best2 = d;
                    besti2 = k;
                } else if (d < best3) {
                    best3 = d;
                    besti3 = k;
                }
                }
                dist2[j * 3 + 0] = best1;
                dist2[j * 3 + 1] = best2;
                dist2[j * 3 + 2] = best3;

                idx[j * 3 + 0] = besti1;
                idx[j * 3 + 1] = besti2;
                idx[j * 3 + 2] = besti3;
            }
        }
    }
    ''',
                   cuda_src=f'''
    @alias(unknown, in0)
    @alias(known, in1)
    @alias(dists2, out0)
    @alias(idx, out1)

    three_nn_kernel<<<{b}, opt_n_threads({n}), 0, 0>>>({b}, {n}, {m}, unknown_p, known_p, dists2_p, idx_p);
    ''')
コード例 #14
0
ファイル: multibox_loss.py プロジェクト: li-xl/Yolact.jittor
    def execute(self, net, predictions, targets, masks, num_crowds):
        """Multibox Loss
        Args:
            net: the network; only forwarded to the mask IoU loss.

            predictions (dict): Network outputs keyed by name. 'loc', 'conf',
            'mask' and 'priors' are always read; 'proto', 'score', 'inst',
            'classes' and 'segm' are read only when the matching cfg option
            is enabled.
                loc shape: jt.size(batch_size,num_priors,4)
                conf shape: jt.size(batch_size,num_priors,num_classes)
                masks shape: jt.size(batch_size,num_priors,mask_dim)
                priors shape: jt.size(num_priors,4)
                proto* shape: jt.size(batch_size,mask_h,mask_w,mask_dim)

            targets (list<tensor>): Ground truth boxes and labels for a batch,
                shape: [batch_size][num_objs,5] (last idx is the label).

            masks (list<tensor>): Ground truth masks for each object in each image,
                shape: [batch_size][num_objs,im_height,im_width]
                Entries are replaced in place by their non-crowd part.

            num_crowds (list<int>): Number of crowd annotations per batch. The crowd
                annotations should be the last num_crowds elements of targets and masks.

            * Only if mask_type == lincomb

        Returns:
            dict: loss name -> loss value (see the loss key at the end of
            the method). 'P', 'E' and 'S' are divided by the batch size,
            every other entry by the total number of positive priors.
        """

        loc_data  = predictions['loc']
        conf_data = predictions['conf']
        mask_data = predictions['mask']
        priors    = predictions['priors']

        if cfg.mask_type == mask_type.lincomb:
            proto_data = predictions['proto']

        score_data = predictions['score'] if cfg.use_mask_scoring   else None
        inst_data  = predictions['inst']  if cfg.use_instance_coeff else None

        labels = [None] * len(targets) # Used in sem segm loss

        batch_size = loc_data.shape[0]
        num_priors = priors.shape[0]
        num_classes = self.num_classes

        # Match priors (default boxes) and ground truth boxes.
        # These buffers are filled per image by match() / the loop below.
        loc_t = jt.empty((batch_size, num_priors, 4),dtype=loc_data.dtype)
        gt_box_t = jt.empty((batch_size, num_priors, 4),dtype=loc_data.dtype)
        conf_t = jt.empty((batch_size, num_priors)).int32()
        idx_t = jt.empty((batch_size, num_priors)).int32()

        if cfg.use_class_existence_loss:
            class_existence_t = jt.empty((batch_size, num_classes-1),dtype=loc_data.dtype)

        # jt.sync(list(predictions.values()))

        for idx in range(batch_size):
            truths      = targets[idx][:, :-1]
            labels[idx] = targets[idx][:, -1].int32()

            if cfg.use_class_existence_loss:
                # Construct a one-hot vector for each object and collapse it into an existence vector with max
                # Also it's fine to include the crowd annotations here
                # NOTE(review): the trailing [0] assumes max(dim=...) returns a
                # (values, indices) pair -- confirm this holds for jittor.
                class_existence_t[idx,:] = jt.eye(num_classes-1)[labels[idx]].max(dim=0)[0]

            # Split the crowd annotations because they come bundled in
            cur_crowds = num_crowds[idx]
            if cur_crowds > 0:
                split = lambda x: (x[-cur_crowds:], x[:-cur_crowds])
                crowd_boxes, truths = split(truths)

                # We don't use the crowd labels or masks
                _, labels[idx] = split(labels[idx])
                _, masks[idx]  = split(masks[idx])
            else:
                crowd_boxes = None


            # Fills row `idx` of loc_t, conf_t and idx_t.
            match(self.pos_threshold, self.neg_threshold,
                  truths, priors, labels[idx], crowd_boxes,
                  loc_t, conf_t, idx_t, idx, loc_data[idx])

            # Ground-truth box assigned to each prior of this image.
            gt_box_t[idx,:,:] = truths[idx_t[idx]]

        # wrap targets
        loc_t.stop_grad()
        conf_t.stop_grad()
        idx_t.stop_grad()

        pos = conf_t > 0
        num_pos = pos.sum(dim=1, keepdims=True)

        # Shape: [batch,num_priors,4]
        pos_idx = pos.unsqueeze(pos.ndim).expand_as(loc_data)

        losses = {}

        # Bound up front so the mask-IoU check below is safe for every
        # cfg combination; it is only replaced by the lincomb mask loss.
        maskiou_targets = None

        # Localization Loss (Smooth L1)
        if cfg.train_boxes:
            loc_p = loc_data[pos_idx].view(-1, 4)
            # From here on loc_t holds only the positive priors' targets.
            loc_t = loc_t[pos_idx].view(-1, 4)
            # print(loc_t)
            losses['B'] = nn.smooth_l1_loss(loc_p, loc_t, reduction='sum') * cfg.bbox_alpha

        if cfg.train_masks:
            if cfg.mask_type == mask_type.direct:
                if cfg.use_gt_bboxes:
                    pos_masks = []
                    for idx in range(batch_size):
                        pos_masks.append(masks[idx][idx_t[idx, pos[idx]]])
                    masks_t = jt.contrib.concat(pos_masks, 0)
                    masks_p = mask_data[pos, :].view(-1, cfg.mask_dim)
                    losses['M'] = nn.bce_loss(jt.clamp(masks_p, 0, 1), masks_t, size_average=False) * cfg.mask_alpha
                else:
                    losses['M'] = self.direct_mask_loss(pos_idx, idx_t, loc_data, mask_data, priors, masks)
            elif cfg.mask_type == mask_type.lincomb:
                ret = self.lincomb_mask_loss(pos, idx_t, loc_data, mask_data, priors, proto_data, masks, gt_box_t, score_data, inst_data, labels)
                if cfg.use_maskiou:
                    loss, maskiou_targets = ret
                else:
                    loss = ret
                losses.update(loss)

                if cfg.mask_proto_loss is not None:
                    if cfg.mask_proto_loss == 'l1':
                        losses['P'] = jt.mean(jt.abs(proto_data)) / self.l1_expected_area * self.l1_alpha
                    elif cfg.mask_proto_loss == 'disj':
                        losses['P'] = -jt.mean(jt.max(nn.log_softmax(proto_data, dim=-1), dim=-1)[0])

        # Confidence loss
        if cfg.use_focal_loss:
            if cfg.use_sigmoid_focal_loss:
                losses['C'] = self.focal_conf_sigmoid_loss(conf_data, conf_t)
            elif cfg.use_objectness_score:
                losses['C'] = self.focal_conf_objectness_loss(conf_data, conf_t)
            else:
                losses['C'] = self.focal_conf_loss(conf_data, conf_t)
        else:
            if cfg.use_objectness_score:
                # NOTE(review): loc_p only exists when cfg.train_boxes is set.
                losses['C'] = self.conf_objectness_loss(conf_data, conf_t, batch_size, loc_p, loc_t, priors)
            else:
                losses['C'] = self.ohem_conf_loss(conf_data, conf_t, pos, batch_size)

        # Mask IoU Loss
        if cfg.use_maskiou and maskiou_targets is not None:
            losses['I'] = self.mask_iou_loss(net, maskiou_targets)

        # These losses also don't depend on anchors
        if cfg.use_class_existence_loss:
            losses['E'] = self.class_existence_loss(predictions['classes'], class_existence_t)
        if cfg.use_semantic_segmentation_loss:
            losses['S'] = self.semantic_segmentation_loss(predictions['segm'], masks, labels)

        # Divide all losses by the number of positives.
        # Don't do it for loss[P] because that doesn't depend on the anchors.
        total_num_pos = num_pos.sum().float()
        for k in losses:
            if k not in ('P', 'E', 'S'):
                losses[k] /= total_num_pos
            else:
                losses[k] /= batch_size

        # Loss Key:
        #  - B: Box Localization Loss
        #  - C: Class Confidence Loss
        #  - M: Mask Loss
        #  - P: Prototype Loss
        #  - D: Coefficient Diversity Loss
        #  - E: Class Existence Loss
        #  - S: Semantic Segmentation Loss
        return losses
コード例 #15
0
    def __init__(self, masks, size):
        """
        Store instance masks as a single [num_instances, H, W] array.

        Arguments:
            masks: Either jt.Var of [num_instances, H, W] (a single
                    [H, W] mask is also accepted and gets a leading axis),
                or list/tuple of jt.Vars of [H, W] with num_instances elems,
                or RLE (Run Length Encoding) - interpreted as list of dicts
                    that carry "counts" and "size" entries,
                or BinaryMaskList.
            size: absolute image size as a 2-element list/tuple, width first

        Raises:
            RuntimeError: if `masks`, or the first element of a non-empty
                list/tuple, is not one of the supported representations.
            AssertionError: if `size` is malformed, the RLE entries disagree
                on their size, or the resulting masks do not match `size`.

        After initialization, a hard copy will be made, to leave the
        initializing source data intact.
        """

        assert isinstance(size, (list, tuple))
        assert len(size) == 2

        if isinstance(masks, jt.Var):
            # The raw data representation is passed as argument
            masks = masks.clone()
        elif isinstance(masks, (list, tuple)):
            if len(masks) == 0:
                # No instances: keep the spatial dims so the shape checks
                # at the end still hold (size is width-first, shape is H, W).
                masks = jt.empty([0, size[1], size[0]])
            elif isinstance(masks[0], jt.Var):
                masks = jt.stack(masks, dim=0).clone()
            elif isinstance(masks[0], dict) and "counts" in masks[0]:
                # RLE interpretation
                if isinstance(masks[0]["counts"], (list, tuple)):
                    # List-valued "counts" are converted with frPyObjects
                    # before decoding (presumably uncompressed RLE -- see
                    # the pycocotools mask API).
                    masks = mask_utils.frPyObjects(masks, size[1], size[0])
                rle_sizes = [tuple(inst["size"]) for inst in masks]

                masks = mask_utils.decode(masks)  # [h, w, n]
                masks = jt.array(masks).transpose(2, 0, 1)  # [n, h, w]

                assert rle_sizes.count(rle_sizes[0]) == len(rle_sizes), (
                    "All the sizes must be the same size: %s" % rle_sizes)

                # in RLE, height come first in "size"
                rle_height, rle_width = rle_sizes[0]
                assert masks.shape[1] == rle_height
                assert masks.shape[2] == rle_width

                width, height = size
                if width != rle_width or height != rle_height:
                    # The RLE was encoded at a different resolution than the
                    # requested image size: resample to (height, width) and
                    # cast back to the decoded masks' type.
                    masks = interpolate(
                        input=masks.unsqueeze(0).float(),
                        size=(height, width),
                        mode="bilinear",
                        align_corners=False,
                    )[0].type_as(masks)
            else:
                # Previously the exception was constructed but never raised,
                # so bad input only failed later with an unrelated error.
                raise RuntimeError(
                    "Type of `masks[0]` could not be interpreted: %s" %
                    type(masks[0]))
        elif isinstance(masks, BinaryMaskList):
            # just hard copy the BinaryMaskList instance's underlying data
            masks = masks.masks.clone()
        else:
            # Same fix as above: actually raise instead of discarding.
            raise RuntimeError(
                "Type of `masks` argument could not be interpreted:%s" %
                type(masks))

        if len(masks.shape) == 2:
            # if only a single instance mask is passed
            masks = masks.unsqueeze(0)

        assert len(masks.shape) == 3
        assert masks.shape[1] == size[1], "%s != %s" % (masks.shape[1],
                                                        size[1])
        assert masks.shape[2] == size[0], "%s != %s" % (masks.shape[2],
                                                        size[0])

        self.masks = masks
        self.size = tuple(size)