def execute(self, pc1, pc2, reduction='mean', dims='BNC'):
    """Run the match kernel and then the match-cost kernel on two point clouds.

    Parameters
    ----------
    pc1, pc2 : jt.Var
        Rank-3 inputs laid out as named by ``dims``. Both must have the
        same batch size.
    reduction : 'mean', 'sum' or None
        Reduction applied to the per-batch cost vector. Any other value
        matches no branch, so the call returns ``None``.
    dims : 'BNC' or 'BCN'
        'BCN' inputs are permuted to 'BNC' before the kernels run.

    Returns
    -------
    jt.Var
        The ``[batch_size]`` cost vector when ``reduction`` is None,
        otherwise its sum or mean.
    """
    assert dims in ['BNC', 'BCN']
    if dims == 'BCN':
        pc1 = pc1.permute(0, 2, 1)
        pc2 = pc2.permute(0, 2, 1)

    n_batch, N, _ = pc1.shape
    n_batch_other, M, _ = pc2.shape
    assert n_batch == n_batch_other

    # Zero-filled buffer handed to the match kernel as its third input.
    scratch = jt.zeros([n_batch, (N + M) * 2], pc1.dtype)
    match = jt.code(
        shape=[n_batch, M, N],
        dtype=pc1.dtype,
        inputs=[pc1, pc2, scratch],
        cuda_header=EMD_gpu_header,
        cuda_src=approxmatch_gpu_src,
    )
    emd = jt.code(
        shape=[n_batch],
        dtype=pc1.dtype,
        inputs=[pc1, pc2, match],
        cuda_header=EMD_gpu_header,
        cuda_src=matchcost_gpu_src,
    )

    # Kept for the backward pass.
    self.saved_vars = (pc1, pc2, match, reduction)

    if reduction is None:
        return emd
    if reduction == 'sum':
        return emd.sum()
    if reduction == 'mean':
        return emd.mean()
    return None
def test_error_msg_trace_py_var(self):
    # A C++ exception thrown inside a code op must surface in Python with the
    # reason, input signature, op type and a backtrace pointing at this file.
    header = """
    #include <algorithm>
    @alias(a, in0)
    @alias(b, out)
    """
    body = """
        for (int i=0; i<a_shape0; i++)
            @b(i) = @a(i);
        std::sort(&@b(0), &@b(in0_shape0));
        throw std::runtime_error("???");
    """
    a = jt.array([3, 2, 1])
    b = jt.code(a.shape, a.dtype, [a], cpu_header=header, cpu_src=body)

    msg = ""
    try:
        # Printing forces evaluation, which is where the op throws.
        print(b)
    except Exception as e:
        msg = str(e)
    print(msg)

    expected_fragments = (
        "[Reason]: ???",
        "[Input]: int32[3,]",
        "[OP TYPE]: code",
        "[Async Backtrace]:",
        "test_error_msg.py:",
    )
    for fragment in expected_fragments:
        assert fragment in msg
def mask_prob_cuda(embed_pixel, embed_center, sigma_center, boxes, box_areas,
                   area_sum, mask_width):
    """Launch the CUDA kernel in ``CUDA_SRC`` over pixel and center embeddings.

    The inputs are validated first: ``embed_pixel`` is MxDim, ``embed_center``
    is NxDim, ``sigma_center`` has length N and ``boxes`` has N rows.
    ``area_sum`` and ``mask_width`` are converted to ``int`` and passed as
    compile options attached to an empty placeholder Var.

    Returns an MxN Var with the dtype of ``embed_pixel``, or an empty array of
    that dtype when either M or N is zero.
    """
    assert embed_pixel.ndim == 2, "embed_pixel should be MxDim"
    assert embed_center.ndim == 2, "embed_center should be NxDim"
    assert sigma_center.ndim == 1, "sigma_center should be N"
    assert embed_pixel.shape[1] == embed_center.shape[1], "Dim should the same"
    assert embed_center.shape[0] == sigma_center.shape[0], \
        "center number should be the same"
    assert embed_center.shape[0] == boxes.shape[0], \
        "center number and box number should be the same"

    n_pixels = embed_pixel.shape[0]
    n_centers = embed_center.shape[0]
    if n_pixels * n_centers == 0:
        # Nothing to compute; skip the kernel launch entirely.
        return jt.array([], embed_pixel.dtype)

    # Zero-sized Var used only as a carrier for the compile-time constants.
    option = jt.empty((0, ))
    option.compile_options = {
        "area_sum": int(area_sum),
        "mask_width": int(mask_width)
    }

    return jt.code(
        (n_pixels, n_centers),
        embed_pixel.dtype,
        [embed_pixel, embed_center, sigma_center, boxes, box_areas, option],
        cuda_header=CUDA_HEADER,
        cuda_src=CUDA_SRC)
def execute(self, x):
    '''
    Select ``self.n_samples`` points per batch using the indices produced by
    the CUDA kernel in ``self.cuda_src``.

    Parameters
    ----------
    x: jt.Var, (B, N, 3), float32

    Returns
    -------
    y: jt.Var, (B, n_samples, 3)
    '''
    B, N, C = x.shape
    assert self.n_samples <= N
    assert C == 3
    assert x.dtype == 'float32'

    # The kernel source is a template: '#block_size' is filled in per call.
    kernel = self.cuda_src.replace('#block_size', str(optimal_block(B)))
    idxs = jt.code([B, self.n_samples], 'int32', [x,], cuda_src=kernel)

    # Gather x[b, idxs[b, s], c] for every (b, s, c).
    gather_index = [
        'i0',            # Batchid
        '@e0(i0, i1)',   # Nid
        'i2',
    ]
    return x.reindex([B, self.n_samples, 3], gather_index, extras=[idxs])
def grad(self, grad_x):
    """Backward pass: ``out[i] = grad_x[i] * x[i] * 4`` for the saved input ``x``."""
    saved_input = self.save_vars
    backward_src = '''
        for (int i=0; i<in0_shape0; i++)
            @out(i) = @in1(i)*@in0(i)*4;
    '''
    return jt.code(saved_input.shape, saved_input.dtype,
                   [saved_input, grad_x], cpu_src=backward_src)
def test_type(self):
    # Checks NanoString conversion from Python, NumPy and Jittor types, and
    # that bad calls produce error messages describing the offending arguments.
    import numpy as np

    # `np.float` was only an alias of the builtin `float` and was removed in
    # NumPy 1.24; fall back to the builtin so the check runs on any NumPy.
    np_float = getattr(np, "float", float)

    assert str(jt.NanoString(float)) == "float"
    assert str(jt.NanoString(np_float)) == "float"
    assert str(jt.NanoString(np.float32)) == "float32"
    assert str(jt.NanoString(np.float64)) == "float64"
    assert str(jt.NanoString(np.int8)) == "int8"
    assert str(jt.NanoString(np.array([1, 2, 3]).dtype)) == "int64"

    assert str(jt.NanoString(jt.float)) == "float"
    assert str(jt.NanoString(jt.float32)) == "float32"
    assert str(jt.NanoString(jt.float64)) == "float64"
    assert str(jt.NanoString(jt.int8)) == "int8"
    assert str(jt.NanoString(jt.array([1, 2, 3]).dtype)) == "int64"

    assert str(jt.NanoString(jt.sum)) == "add"

    def get_error_str(call):
        # Return the message of the exception raised by `call`, or "" if none.
        es = ""
        try:
            call()
        except Exception as e:
            es = str(e)
        return es

    # A dict passed where a dtype is expected must trigger the usage hint.
    e = get_error_str(lambda: jt.code([1, ], {}, [1], cpu_header=""))
    assert "help(jt.ops.code)" in e
    assert "cpu_header=str" in e

    # An unknown keyword must be echoed back as `name=type`. The keyword has
    # to be a valid identifier, otherwise this file does not even parse.
    e = get_error_str(lambda: jt.NanoString([1, 2, 3], bad_kwarg=1))
    assert "bad_kwarg=int" in str(e)
    assert "(list, )" in str(e)
def execute(self, x):
    """Forward pass: ``out[i] = x[i] * x[i] * 2``; ``x`` is kept for ``grad``."""
    forward_src = '''
        for (int i=0; i<in0_shape0; i++)
            @out(i) = @in0(i)*@in0(i)*2;
    '''
    self.save_vars = x
    return jt.code(x.shape, x.dtype, [x], cpu_src=forward_src)
def chamfer_loss(pc1, pc2, reduction='mean', sqrt=True):
    '''
    return the chamfer loss from pc1 to pc2.

    For every point of pc1 the code op yields an index into pc2; the loss is
    the distance between each pc1 point and the pc2 point selected by it.

    Parameters:
    ===========
        pc1:  [B, N, xyz]
        pc2:  [B, M, xyz]
        reduction: 'mean', 'sum', or None (any other value returns None)
        sqrt: take the square root of the summed squared differences when
              True, otherwise return the squared distance
    '''
    batch_size, n_samples_pc1, _ = pc1.shape
    batch_size_pc2, _, _ = pc2.shape
    assert batch_size == batch_size_pc2

    idx = jt.code([batch_size, n_samples_pc1], 'int32', [pc1, pc2],
                  cpu_src=cpu_src, cuda_src=cuda_src)
    nearest_pts = select_vertices(pc2, idx)

    squared = ((pc1 - nearest_pts)**2).sum(dim=-1)
    chamfer_distance = squared.sqrt() if sqrt else squared

    if reduction is None:
        return chamfer_distance
    if reduction == 'sum':
        return jt.sum(chamfer_distance)
    if reduction == 'mean':
        return jt.mean(chamfer_distance)
    return None
def test_cuda2(self):
    # Element-wise product of two 100x100 matrices with hand-written CUDA
    # forward and gradient kernels; gradients are checked against NumPy.
    forward_src = '''
        __global__ static void kernel1(@ARGS_DEF) {
            @PRECALC
            for (int i=blockIdx.x; i<in0_shape0; i+=gridDim.x)
            for (int j=threadIdx.x; j<in0_shape1; j+=blockDim.x)
                @out(i,j) = @in0(i,j)*@in1(i,j);
        }
        kernel1<<<32, 32>>>(@ARGS);
    '''
    # Gradient w.r.t. the first input.
    grad_lhs_src = '''
        __global__ static void kernel(@ARGS_DEF) {
            @PRECALC
            for (int i=blockIdx.x; i<in0_shape0; i+=gridDim.x)
            for (int j=threadIdx.x; j<in0_shape1; j+=blockDim.x)
                @out(i,j) = @dout(i,j)*@in1(i,j);
        }
        kernel<<<32, 32>>>(@ARGS);
    '''
    # Gradient w.r.t. the second input; also references the forward output.
    grad_rhs_src = '''
        __global__ static void kernel(@ARGS_DEF) {
            @PRECALC
            @pout(0,0);
            for (int i=blockIdx.x; i<in0_shape0; i+=gridDim.x)
            for (int j=threadIdx.x; j<in0_shape1; j+=blockDim.x)
                @out(i,j) = @dout(i,j)*@in0(i,j);
        }
        kernel<<<32, 32>>>(@ARGS);
    '''

    lhs = jt.random((100,100))
    rhs = jt.random((100,100))
    prod = jt.code(lhs.shape, lhs.dtype, [lhs, rhs],
                   cuda_src=forward_src,
                   cuda_grad_src=[grad_lhs_src, grad_rhs_src])
    d_lhs, d_rhs = jt.grad(prod, [lhs, rhs])

    expected = lhs.data*rhs.data
    assert np.allclose(prod.data, expected), (prod.data, lhs.data*rhs.data)
    assert np.allclose(d_lhs.data, rhs.data)
    assert np.allclose(d_rhs.data, lhs.data)
def searchsorted(sorted, values, right=False):
    """
    Find the indices from the innermost dimension of `sorted` for each `values`.

    A binary search is run per value. With ``right=False`` the search advances
    past elements strictly less than the value; with ``right=True`` it also
    advances past elements equal to it. `sorted` must either have the same
    number of rows as `values` or a single row, which is then used for every
    row of `values`. The result has the shape of `values` and dtype int32.

    Example::

        sorted = jt.array([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]])
        values = jt.array([[3, 6, 9], [3, 6, 9]])
        ret = jt.searchsorted(sorted, values)
        assert (ret == [[1, 3, 4], [1, 2, 4]]).all(), ret

        ret = jt.searchsorted(sorted, values, right=True)
        assert (ret == [[2, 3, 5], [1, 3, 4]]).all(), ret

        sorted_1d = jt.array([1, 3, 5, 7, 9])
        ret = jt.searchsorted(sorted_1d, values)
        assert (ret == [[1, 3, 4], [1, 3, 4]]).all(), ret
    """
    # Pieces substituted into the C++ template below.
    sorted_type = sorted.dtype
    values_type = values.dtype
    compare_op = "<=" if right else "<"

    header = f"""
namespace jittor {{

@python.jittor.auto_parallel(2)
inline static void searchsorted(
    int batch_num, int batch_id, int value_num, int value_id,
    int sorted_num, int batch_stride,
    {sorted_type}* __restrict__ sort_p,
    {values_type}* __restrict__ value_p,
    int32* __restrict__ index_p) {{
    int32 l = batch_id * batch_stride;
    int32 r = l + sorted_num;
    auto v = value_p[batch_id * value_num + value_id];
    while (l<r) {{
        int32 m = (l+r)/2;
        if (sort_p[m] {compare_op} v)
            l = m+1;
        else
            r = m;
    }}
    index_p[batch_id * value_num + value_id] = l - batch_id * batch_stride;
}}

}}
"""
    # batch_stride is 0 when `sorted` has a single row, so all rows share it.
    launcher = """
    int value_num = in1->shape[in1->shape.size()-1];
    int sorted_num = in0->shape[in0->shape.size()-1];
    int32 batch_num = in0->num / sorted_num;
    int32 batch_num2 = in1->num / value_num;
    int32 batch_stride = batch_num == 1 ? 0 : sorted_num;
    CHECK(batch_num == batch_num2 || batch_num == 1);
    searchsorted(batch_num2, 0, value_num, 0, sorted_num, batch_stride, in0_p, in1_p, out0_p);
"""
    # The same source is used for both the CPU and CUDA back ends.
    return jt.code(values.shape, "int32", [sorted, values],
                   cpu_header=header,
                   cpu_src=launcher,
                   cuda_header=header,
                   cuda_src=launcher)
def roi_pool(input, rois, output_size, spatial_scale):
    """Run the RoI pooling CUDA op and return the pooled features.

    The op produces two outputs of shape
    ``(rois.shape[0], input.shape[1], out_h, out_w)``: the pooled values
    (dtype of ``input``) and an int32 companion. Only the pooled values
    are returned.
    """
    pooled_hw = _pair(output_size)
    out_shape = (rois.shape[0], input.shape[1], pooled_hw[0], pooled_hw[1])
    # The scalar scale is wrapped in a Var so it can be passed as an op input.
    scale_var = jt.array([spatial_scale])

    output, arg_output = jt.code(
        [out_shape, out_shape],
        [input.dtype, 'int32'],
        [input, rois, scale_var],
        cuda_header=CUDA_HEADER,
        cuda_src=CUDA_SRC,
        cuda_grad_src=CUDA_GRAD_SRC)
    return output
def test_array_migrate(self):
    # Runs a CPU-only code op while the CUDA flag is on and checks the result.
    square_times_two = """
        for (int i=0; i<in0shape0; i++)
            @out(i) = @in0(i)*@in0(i)*2;
    """
    with jt.flag_scope(use_cuda=1):
        src_var = jt.array(np.float32([1,2,3]))
        result = jt.code(src_var.shape, src_var.dtype, [src_var],
                         cpu_src=square_times_two)
        assert (result.data==[2,8,18]).all()
def test_parallel(self):
    # Four OpenMP threads each write 456 into the slot named by their thread id.
    omp_src = """
        #pragma omp parallel num_threads(4)
        @out(omp_get_thread_num()) = 456;
    """
    out_var = jt.code([4], "int", cpu_src=omp_src,
                      cpu_header='#include <omp.h>')
    a = out_var.data
    assert (a == [456] * 4).all(), a
def test_return_multi_output(self):
    # jt.code is given existing Vars as outputs; the kernel accumulates `a`
    # into both of them in place, truncated to each output's length.
    accumulate_src = """
        @alias(a, in0)
        @alias(b, out0)
        @alias(c, out1)
        for (int i=0; i<a_shape0; i++) {
            if (i<b_shape0) @b(i) += @a(i);
            if (i<c_shape0) @c(i) += @a(i);
        }
    """
    a = jt.array([3, 2, 1])
    b = jt.array([1, 2])
    c = jt.array([3, 4, 5, 6])
    jt.code([a], [b, c], cpu_src=accumulate_src)

    assert b.shape == [2]
    assert c.shape == [4]
    assert (b.data == [4, 4]).all()
    # The fourth element of c has no counterpart in a, so only three are checked.
    assert (c.data[:3] == [6, 6, 6]).all()
def chamfer_loss(pc1, pc2, reduction='mean', dims='BNC', bidirectional=False):
    ''' return the chamfer loss from pc1 to pc2.

    :param pc1:  input point cloud
    :type pc1: jittor array

    :param pc2:  input point cloud
    :type pc2: jittor array

    :param reduction: reduction method in batches, can be 'mean', 'sum', or None. Default: 'mean'. Any other value makes the function return None.
    :type reduction: str, optional

    :param dims: a string that represents each dimension, can be
        '[BNC]' ([batch, number of points, xyz]), or
        '[BCN]' ([batch, xyz, number of points]). Default: 'BNC'.
    :type dims: str, optional

    :param bidirectional: when True, return the loss from pc1 to pc2 plus the loss from pc2 to pc1. Default: False.
    :type bidirectional: bool, optional

    Example:

    >>> import jittor as jt
    >>> from jittor.loss3d import chamfer_loss
    >>> jt.flags.use_cuda = True
    >>> pc1 = jt.rand([10, 100, 3], dtype=jt.float32)
    >>> pc2 = jt.rand([10, 100, 3], dtype=jt.float32)
    >>> cf = chamfer_loss(pc1, pc2, dims='BNC', bidirectional=True)
    >>> print('chamfer loss =', cf.item())
    '''
    if bidirectional:
        # Two one-directional calls; `bidirectional` stays False in both.
        forward = chamfer_loss(pc1, pc2, reduction, dims)
        backward = chamfer_loss(pc2, pc1, reduction, dims)
        return forward + backward

    assert dims in ['BNC', 'BCN']
    if dims == 'BCN':
        pc1 = pc1.permute(0, 2, 1)
        pc2 = pc2.permute(0, 2, 1)

    batch_size, N, _ = pc1.shape
    batch_size_pc2, M, _ = pc2.shape
    assert batch_size == batch_size_pc2

    # One pc2 index per pc1 point.
    idx = jt.code([batch_size, N], 'int32', [pc1, pc2],
                  cpu_src=cpu_src, cuda_src=cuda_src)

    # Gather pc2[b, idx[b, n], c] so it lines up with pc1.
    gather_shape = [batch_size, idx.shape[1], 3]
    gather_index = ['i0', '@e0(i0, i1)', 'i2']
    nearest_pts = pc2.reindex(gather_shape, gather_index, extras=[idx])

    squared = ((pc1 - nearest_pts)**2).sum(dim=-1)
    chamfer_distance = squared.sqrt()

    if reduction is None:
        return chamfer_distance
    if reduction == 'sum':
        return jt.sum(chamfer_distance)
    if reduction == 'mean':
        return jt.mean(chamfer_distance)
    return None
def test_device_allocator(self):
    # Feeds the result of a regular op (a + 1) into a CPU code op.
    square_times_two = """
        for (int i=0; i<in0_shape0; i++)
            @out(i) = @in0(i)*@in0(i)*2;
    """
    a = jt.array([1, 2, 3, 4, 5])
    shifted = a + 1
    c = jt.code(a.shape, a.dtype, [shifted], cpu_src=square_times_two)
    assert (c.data == [8, 18, 32, 50, 72]).all()
def test_header(self):
    # The `header` argument supplies <algorithm>, so the kernel can call
    # std::sort on the copied output.
    copy_then_sort = """
        for (int i=0; i<in0shape0; i++)
            @out(i) = @in0(i);
        std::sort(&@out(0), &@out(in0shape0));
    """
    unsorted = jt.array([3, 2, 1])
    b = jt.code(unsorted.shape, unsorted.dtype, [unsorted],
                header='#include <algorithm>',
                cpu_src=copy_then_sort)
    assert (b.data == [1, 2, 3]).all()
def execute(self, a, b):
    """Forward pass: element-wise product of two 2-D Vars in a CUDA kernel.

    Both inputs are saved for the backward pass. The output has the shape and
    dtype of ``a``.
    """
    product_src = '''
        __global__ static void kernel1(@ARGS_DEF) {
            @PRECALC
            for (int i=blockIdx.x; i<in0_shape0; i+=gridDim.x)
            for (int j=threadIdx.x; j<in0_shape1; j+=blockDim.x)
                @out(i,j) = @in0(i,j)*@in1(i,j);
        }
        kernel1<<<32, 32>>>(@ARGS);
    '''
    self.save_vars = a, b
    return jt.code(a.shape, a.dtype, [a,b], cuda_src=product_src)
def simple_presum(x):
    """Return the running sums of `x` along its last axis, with a leading zero.

    For each row of length L the output row has length L + 1:
    ``out[0] = 0`` and ``out[i + 1] = out[i] + x[i]``. The output shape is
    ``x.shape[:-1] + (L + 1,)`` and the dtype is that of `x`. The same
    source is used for the CPU and CUDA back ends.
    """
    # Input rows are `nl` long and output rows are `nl + 1` long, so the two
    # buffers need different row strides. `nl` must be the last-axis length:
    # passing the total element count only works for a single-row input and
    # writes out of bounds otherwise.
    src = '''
__inline_static__
@python.jittor.auto_parallel(1)
void kernel(int n0, int i0, in0_type* x, in0_type* out, int nl) {
    out[i0*(nl+1)] = 0;
    for (int i=0; i<nl; i++)
        out[i0*(nl+1)+i+1] = out[i0*(nl+1)+i] + x[i0*nl+i];
}
kernel(in0->num/in0->shape[in0->shape.size()-1], 0, in0_p, out0_p, in0->shape[in0->shape.size()-1]);
    '''
    return jt.code(x.shape[:-1]+(x.shape[-1]+1,), x.dtype, [x],
                   cpu_src=src, cuda_src=src)
def execute(self, x_q, x_r):
    """Run the CUDA kernel in ``self.cuda_src`` and return its int32 index output.

    Both inputs are unpacked as ``(batch_size, c_dim, n_points)``. The result
    has shape ``[batch_size, self.k, q_points]``, with the batch size taken
    from ``x_r``.
    """
    _, _, q_points = x_q.shape
    batch_size, c_dim, r_points = x_r.shape

    # Uninitialised float32 buffer passed to the kernel as its third input.
    tmp_dist = jt.empty((batch_size, r_points, q_points), "float32")

    out_shape = [batch_size, self.k, q_points]
    # Inside the kernel in0 is x_r and in1 is x_q.
    outputs = jt.code(
        [out_shape],
        ['int32'],
        [x_r, x_q, tmp_dist],
        cuda_src=self.cuda_src,
        cuda_header=self.cuda_inc,
    )
    idxs, = outputs
    return idxs
def grad(self, grad):
    """Backward pass: run the backward CUDA kernel for the saved input.

    Rebuilds the cos/sin lookup tables from the saved ``numangle`` and
    ``numrho``, then launches the ``csb`` template with
    ``[x, grad, tabCos, tabSin]`` as inputs. Returns the result of
    ``jt.code`` called with a single output shaped and typed like ``x``.
    """
    x, numangle, numrho = self.save_vars
    # `h` and `w` were used below without ever being bound in this function;
    # take them from the saved input.
    # Assumes x is 4-D (n, c, h, w) — TODO confirm against the forward pass.
    _, _, h, w = x.shape

    # Fill the '#numangle' / '#numrho' placeholders of the kernel template.
    cuda_src_backward = csb.replace('#numangle', str(numangle))
    cuda_src_backward = cuda_src_backward.replace('#numrho', str(numrho))

    # Step sizes along rho and theta.
    irho = int((h * h + w * w)**0.5 + 1) / float((numrho - 1))
    itheta = 3.14159265358979323846 / numangle

    angle = jt.arange(numangle) * itheta
    tabCos = angle.cos() / irho
    tabSin = angle.sin() / irho

    return jt.code([x.shape], [x.dtype], [x, grad, tabCos, tabSin],
                   cuda_src=cuda_src_backward)
def grad(self, grad):
    """Backward pass of the element-wise product: returns ``grad*b`` and ``grad*a``.

    One CUDA kernel writes both gradients; the outputs take the shapes and
    dtypes of the saved ``a`` and ``b``.
    """
    a, b = self.save_vars
    # in0 = a, in1 = b, in2 = incoming gradient.
    backward_src = '''
        __global__ static void kernel2(@ARGS_DEF) {
            @PRECALC
            for (int i=blockIdx.x; i<in0_shape0; i+=gridDim.x)
            for (int j=threadIdx.x; j<in0_shape1; j+=blockDim.x) {
                @out0(i,j) = @in2(i,j)*@in1(i,j);
                @out1(i,j) = @in2(i,j)*@in0(i,j);
            }
        }
        kernel2<<<32, 32>>>(@ARGS);
    '''
    out_shapes = [a.shape, b.shape]
    out_dtypes = [a.dtype, b.dtype]
    return jt.code(out_shapes, out_dtypes, [a, b, grad],
                   cuda_src=backward_src)
def grad(self, grad):
    """Backward pass: compute gradients for ``pc1`` and ``pc2`` with two CUDA kernels.

    The incoming gradient is first expanded to one value per batch entry
    according to the reduction used in the forward pass ('sum': copied,
    'mean': copied and divided by the batch size; otherwise left as is).
    Returns ``(grad_pc1, grad_pc2)``, shaped and typed like the saved inputs.
    """
    pc1, pc2, match, reduction = self.saved_vars
    n_batch = pc1.shape[0]

    if reduction == 'sum':
        grad = jt.ones([n_batch]) * grad
    elif reduction == 'mean':
        grad = jt.ones([n_batch]) * grad / n_batch

    grad_pc1 = jt.code(
        shape=pc1.shape,
        dtype=pc1.dtype,
        inputs=[grad, pc1, pc2, match],
        cuda_src=matchcost_grad1_gpu_src,
    )
    grad_pc2 = jt.code(
        shape=pc2.shape,
        dtype=pc2.dtype,
        inputs=[grad, pc1, pc2, match],
        cuda_src=matchcost_grad2_gpu_src,
    )
    return grad_pc1, grad_pc2
def test_header(self):
    # cpu_header supplies both the <algorithm> include and the aliases that
    # the kernel body relies on.
    header = """
    #include <algorithm>
    @alias(a, in0)
    @alias(b, out)
    """
    copy_then_sort = """
        for (int i=0; i<a_shape0; i++)
            @b(i) = @a(i);
        std::sort(&@b(0), &@b(in0_shape0));
    """
    a = jt.array([3, 2, 1])
    b = jt.code(a.shape, a.dtype, [a],
                cpu_header=header, cpu_src=copy_then_sort)
    assert (b.data == [1, 2, 3]).all()
def execute(self, featuremap, boxes, box_ind):
    """
    RoIAlign based on crop_and_resize.
    See more details on https://github.com/longcw/RoIAlign.pytorch

    The pixel-space boxes are rescaled by ``(image size - 1)`` and reordered
    to (y1, x1, y2, x2) before being passed to the crop-and-resize op. With
    ``self.transform_fpcoor`` set, the corners are first shifted using the
    per-cell spacing of the crop grid.

    :param featuremap: NxCxHxW
    :param boxes: Mx4 float box with (x1, y1, x2, y2) **without normalization**
    :param box_ind: M
    :return: MxCxoHxoW
    """
    def column(c):
        # Column c of `boxes`, kept as an Mx1 Var.
        return boxes.reindex([boxes.shape[0],1], ["i0", str(c)])

    x1, y1, x2, y2 = column(0), column(1), column(2), column(3)
    image_height, image_width = featuremap.shape[2:4]
    w_span = float(image_width - 1)
    h_span = float(image_height - 1)

    if self.transform_fpcoor:
        spacing_w = (x2 - x1) / float(self.crop_width)
        spacing_h = (y2 - y1) / float(self.crop_height)

        nx0 = (x1 + spacing_w / 2 - 0.5) / w_span
        ny0 = (y1 + spacing_h / 2 - 0.5) / h_span
        nw = spacing_w * float(self.crop_width - 1) / w_span
        nh = spacing_h * float(self.crop_height - 1) / h_span

        corners = (ny0, nx0, ny0 + nh, nx0 + nw)
    else:
        x1 = x1 / w_span
        x2 = x2 / w_span
        y1 = y1 / h_span
        y2 = y2 / h_span
        corners = (y1, x1, y2, x2)
    boxes = jt.contrib.concat(corners, 1)

    out_shape = (boxes.shape[0], featuremap.shape[1],
                 self.crop_height, self.crop_width)
    # The scalar is wrapped in a Var so it can be passed as an op input.
    extrapolation_value = jt.array([self.extrapolation_value])

    return jt.code(
        out_shape,
        featuremap.dtype,
        [featuremap, boxes, box_ind, extrapolation_value],
        cpu_header=ROIALIGN_CPU_HEADER,
        cpu_src=ROIALIGN_CPU_SRC,
        cpu_grad_src=ROIALIGN_CPU_GRAD_SRC,
        cuda_header=ROIALIGN_CUDA_HEADER,
        cuda_src=ROIALIGN_CUDA_SRC,
        cuda_grad_src=ROIALIGN_CUDA_GRAD_SRC)
def roi_align(input, rois, output_size, spatial_scale, sampling_ratio):
    """Run the RoI align CUDA op.

    Returns a Var of shape ``(rois.shape[0], input.shape[1], out_h, out_w)``
    with the dtype of ``input``. When there are no RoIs the kernel is skipped
    and a zero Var of that shape is returned.
    """
    pooled_hw = _pair(output_size)
    # The two scalars travel to the kernel packed in a single Var.
    options = jt.array([spatial_scale, sampling_ratio])
    out_shape = (rois.shape[0], input.shape[1], pooled_hw[0], pooled_hw[1])

    if rois.shape[0] == 0:
        return jt.zeros(out_shape, input.dtype)

    return jt.code(out_shape, input.dtype, [input, rois, options],
                   cuda_header=CUDA_HEADER,
                   cuda_src=CUDA_SRC,
                   cuda_grad_src=CUDA_GRAD_SRC)
def test_cuda(self):
    # Element-wise product of two 100000-element vectors, with the kernels
    # defined in cuda_header and launched from cuda_src / cuda_grad_src.
    # Results and gradients are checked against NumPy.
    #
    # Each kernel strides over the data using its own thread index. The loop
    # used to re-declare `i` from 0 and step by 1, which left the computed
    # index and stride unused and made every thread process all elements.
    a = jt.random([100000])
    b = jt.random([100000])
    c = jt.code(a.shape, a.dtype, [a, b],
        cuda_header='''
        namespace jittor {
        __global__ static void kernel1(@ARGS_DEF) {
            @PRECALC
            int i = threadIdx.x + blockIdx.x * blockDim.x;
            int stride = blockDim.x * gridDim.x;
            for (; i<in0shape0; i+=stride)
                @out(i) = @in0(i)*@in1(i);
        }

        __global__ static void kernel2(@ARGS_DEF) {
            @PRECALC
            int i = threadIdx.x + blockIdx.x * blockDim.x;
            int stride = blockDim.x * gridDim.x;
            for (; i<in0shape0; i+=stride)
                @out(i) = @dout(i)*@in1(i);
        }

        __global__ static void kernel3(@ARGS_DEF) {
            @PRECALC
            int i = threadIdx.x + blockIdx.x * blockDim.x;
            int stride = blockDim.x * gridDim.x;
            for (; i<in0shape0; i+=stride)
                @out(i) = @dout(i)*@in0(i);
        }
        }
        ''',
        cuda_src='''
            kernel1<<<(in0shape0-1)/1024+1, 1024>>>(@ARGS);
        ''',
        cuda_grad_src=[
            '''
            kernel2<<<(in0shape0-1)/1024+1, 1024>>>(@ARGS);
            ''',
            '''
            kernel3<<<(in0shape0-1)/1024+1, 1024>>>(@ARGS);
            '''
        ])
    da, db = jt.grad(c, [a, b])
    assert np.allclose(c.data, a.data * b.data), (c.data, a.data * b.data)
    assert np.allclose(da.data, b.data)
    assert np.allclose(db.data, a.data)
def argmax_pool(x, size, stride, padding=0):
    """Max pooling over axes 2 and 3 of `x`, written as a CPU code op.

    Each output element starts from the input at the window centre and is
    raised to the largest in-range input value within ``size/2`` (integer
    division) of that centre in both directions. The gradient kernel zeroes
    its output, then adds each incoming gradient to the first in-range window
    position whose input equals the pooled value.

    `size`, `stride` and `padding` are formatted directly into the C++ source.
    """
    def pooled_len(n):
        return (n + padding * 2 - size) // stride + 1

    y_shape = list(x.shape)
    y_shape[2] = pooled_len(x.shape[2])
    y_shape[3] = pooled_len(x.shape[3])

    forward_src = f'''
        for (int i=0; i<out_shape0; i++)
        for (int j=0; j<out_shape1; j++)
        for (int k=0; k<out_shape2; k++)
        for (int l=0; l<out_shape3; l++) {{
            int kx=k*{stride}+{size}/2-{padding};
            int ky=l*{stride}+{size}/2-{padding};
            @out(i,j,k,l) = @in0(i,j,kx,ky);
            for (int p=kx-{size}/2;p<=kx+{size}/2;p++)
            for (int q=ky-{size}/2;q<=ky+{size}/2;q++)
                if (p>=0 && q>=0 && p<in0_shape2 && q<in0_shape3)
                if (@out(i,j,k,l) < @in0(i,j,p,q))
                    @out(i,j,k,l) = @in0(i,j,p,q);
        }}
    '''
    # `bo` stops the search after the first matching position.
    backward_src = f'''
        for (int i=0; i<out_shape0; i++)
        for (int j=0; j<out_shape1; j++)
        for (int k=0; k<out_shape2; k++)
        for (int l=0; l<out_shape3; l++) @out(i,j,k,l) = 0;

        for (int i=0; i<pout_shape0; i++)
        for (int j=0; j<pout_shape1; j++)
        for (int k=0; k<pout_shape2; k++)
        for (int l=0; l<pout_shape3; l++) {{
            int kx=k*{stride}+{size}/2-{padding};
            int ky=l*{stride}+{size}/2-{padding};
            int bo=1;
            for (int p=kx-{size}/2;p<=kx+{size}/2 && bo;p++)
            for (int q=ky-{size}/2;q<=ky+{size}/2 && bo;q++)
                if (p>=0 && q>=0 && p<in0_shape2 && q<in0_shape3)
                if (@pout(i,j,k,l) == @in0(i,j,p,q)) {{
                    @out(i,j,p,q) += @dout(i,j,k,l);
                    bo=0;
                }}
        }}
    '''
    return jt.code(y_shape, x.dtype, [x],
                   cpu_src=forward_src,
                   cpu_grad_src=[backward_src])
def execute(self, x, numangle, numrho):
    """Forward pass: run the forward CUDA kernel on a 4-D input.

    Builds cos/sin lookup tables for ``numangle`` angles, scaled by the rho
    step size, and launches the ``csf`` template with ``[x, tabCos, tabSin]``.
    The output has shape ``[n, c, numangle, numrho]`` and the dtype of ``x``.
    ``x``, ``numangle`` and ``numrho`` are saved for the backward pass.
    """
    n, c, h, w = x.shape

    # Fill the '#numangle' / '#numrho' placeholders of the kernel template.
    cuda_src_forward = (csf.replace('#numangle', str(numangle))
                           .replace('#numrho', str(numrho)))

    # Step sizes along rho and theta.
    diag = int((h * h + w * w)**0.5 + 1)
    irho = diag / float((numrho - 1))
    itheta = 3.14159265358979323846 / numangle

    angle = jt.arange(numangle) * itheta
    tabCos = angle.cos() / irho
    tabSin = angle.sin() / irho

    output = jt.code([n, c, numangle, numrho], x.dtype, [x, tabCos, tabSin],
                     cuda_src=cuda_src_forward)
    self.save_vars = x, numangle, numrho
    return output
def test_multi_output(self):
    # One code op producing two outputs with different shapes and dtypes;
    # each receives a copy of `a`, truncated to its own length.
    copy_src = """
        @alias(a, in0)
        @alias(b, out0)
        @alias(c, out1)
        for (int i=0; i<a_shape0; i++) {
            if (i<b_shape0) @b(i) = @a(i);
            if (i<c_shape0) @c(i) = @a(i);
        }
    """
    a = jt.array([3, 2, 1])
    out_shapes = [[2], [4]]
    out_dtypes = ["float32", "float64"]
    b, c = jt.code(out_shapes, out_dtypes, [a], cpu_src=copy_src)

    assert b.shape == [2]
    assert c.shape == [4]
    assert b.dtype == "float32"
    assert c.dtype == "float64"
    assert (b.data == [3, 2]).all()
    # The fourth element of c is never written, so only three are checked.
    assert (c.data[:3] == [3, 2, 1]).all()