def mkiterlib(gnm): packer = interp.GenomePacker('iter_params', 'params', cuburn.genome.specs.anim) cp = packer.view(gnm) iterbody = iter_body(cp) bodies = [iter_xf_body(cp, i, x) for i, x in sorted(cp.xforms.items())] if 'final_xform' in cp: bodies.append(iter_xf_body(cp, 'final', cp.final_xform)) bodies.append(iterbody) packer_lib = packer.finalize() lib = devlib(deps=[packer_lib, mwclib, ringbuflib], # We grab the surf decl from palintlib as well decls=iter_decls + interp.palintlib.decls, defs='\n'.join(bodies)) return packer, lib
def finalize(self): """ Create the code to render this genome. """ # At the risk of packing a few things more than once, we don't # uniquify the overall precalc order, sparing us the need to implement # recursive code generation direct = list(self.packed_direct) + list(self.packed_direct_mag) self.packed = direct + list(self.packed_precalc) self.genome = direct + list(self.genome_precalc) self._len = len(self.packed) decls = self._decls.substitute(**self.__dict__) defs = self._defs.substitute(**self.__dict__) return devlib(deps=[catmullromlib], decls=decls, defs=defs)
def mkiterlib(gnm): packer = interp.GenomePacker('iter_params', 'params', cuburn.genome.specs.anim) cp = packer.view(gnm) iterbody = iter_body(cp) bodies = [iter_xf_body(cp, i, x) for i, x in sorted(cp.xforms.items())] if 'final_xform' in cp: bodies.append(iter_xf_body(cp, 'final', cp.final_xform)) bodies.append(iterbody) packer_lib = packer.finalize() lib = devlib( deps=[packer_lib, mwclib, ringbuflib], # We grab the surf decl from palintlib as well decls=iter_decls + interp.palintlib.decls, defs='\n'.join(bodies)) return packer, lib
rgba8lib = devlib(deps=[ringbuflib, mwclib], defs=r''' // Perform a conversion from float32 values to uint8 ones, applying // pixel- and channel-independent dithering to reduce suprathreshold banding // artifacts. Clamps values larger than 1.0f. // TODO: move to a separate module? // TODO: less ineffecient mwc_st handling? __global__ void f32_to_rgba_u8( uchar4 *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; uchar4 out = make_uchar4( fminf(1.0f, in.x) * 255.0f + 0.49f * mwc_next_11(rctx), fminf(1.0f, in.y) * 255.0f + 0.49f * mwc_next_11(rctx), fminf(1.0f, in.z) * 255.0f + 0.49f * mwc_next_11(rctx), fminf(1.0f, in.w) * 255.0f + 0.49f * mwc_next_11(rctx) ); int idst = dstride * y + x; dst[idst] = out; rctxs[rb_incr(rb->tail, tid)] = rctx; } ''')
catmullromlib = devlib(deps=[binsearchlib], decls=r''' __device__ __noinline__ float catmull_rom(const float *times, const float *knots, float t); __device__ __noinline__ float catmull_rom_mag(const float *times, const float *knots, float t); ''', defs=r''' // ELBOW is the linearization threhsold; above this magnitude, a value scales // logarithmically, and below it, linearly. ELOG1 is a constant used to make // this happen. See helpers/spline_mag_domain_interp.wxm for nice graphs. #define ELBOW 0.0625f // 2^(-4) #define ELOG1 5.0f // 1 - log2(elbow) // Transform from linear to magnitude domain __device__ float linlog(float x) { if (x > ELBOW) return log2f(x) + ELOG1; if (x < -ELBOW) return -(log2f(-x) + ELOG1); return x / ELBOW; } // Reverse of above __device__ float linexp(float v) { if (v >= 1.0) return exp2f( v - ELOG1); if (v <= -1.0) return -exp2f(-v - ELOG1); return v * ELBOW; } __device__ float linslope(float x, float m) { if (x >= ELBOW) return m / x; if (x <= -ELBOW) return m / -x; return m / ELBOW; } __device__ float catmull_rom_base(const float *times, const float *knots, float t, bool mag) { int idx = bitwise_binsearch(times, t); // The left bias of the search means that we never have to worry about // overshooting unless the genome is corrupted idx = max(idx, 1); float t1 = times[idx], t2 = times[idx+1] - t1; float rt2 = 1.0f / t2; float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2; t = (t - t1) * rt2; // Now t1 is effectively 0 and t2 is 1 float k0 = knots[idx-1], k1 = knots[idx], k2 = knots[idx+1], k3 = knots[idx+2]; float m1 = (k2 - k0) / (1.0f - t0), m2 = (k3 - k1) / (t3); if (mag) { m1 = linslope(k1, m1); m2 = linslope(k2, m2); k1 = linlog(k1); k2 = linlog(k2); } float tt = t * t, ttt = tt * t; float r = m1 * ( ttt - 2.0f*tt + t) + k1 * ( 2.0f*ttt - 3.0f*tt + 1) + m2 * ( ttt - tt) + k2 * (-2.0f*ttt + 3.0f*tt); if (mag) r = linexp(r); return r; } // Variants with scaling domain logic inlined __device__ __noinline__ float catmull_rom(const float *times, const float *knots, float t) { return catmull_rom_base(times, knots, t, false); } __device__ __noinline__ float catmull_rom_mag(const float *times, const float *knots, float t) { return catmull_rom_base(times, knots, t, true); } ''')
texshearlib = devlib(defs=r''' // Filter directions specified in degrees, using image/texture addressing // [(0,0) is upper left corner, 90 degrees is down]. __constant__ float2 addressing_patterns[16] = { { 1.0f, 0.0f}, { 0.0f, 1.0f}, // 0, 1: 0, 90 { 1.0f, 1.0f}, {-1.0f, 1.0f}, // 2, 3: 45, 135 { 1.0f, 0.5f}, {-0.5f, 1.0f}, // 4, 5: 22.5, 112.5 { 1.0f, -0.5f}, { 0.5f, 1.0f}, // 6, 7: -22.5, 67.5 { 1.0f, 0.666667f}, {-0.666667f, 1.0f}, // 8, 9: 30, 120 { 1.0f, -0.666667f}, { 0.666667f, 1.0f}, // 10, 11: -30, 60 { 1.0f, 0.333333f}, {-0.333333f, 1.0f}, // 12, 13: 15, 105 { 1.0f, -0.333333f}, { 0.333333f, 1.0f}, // 14, 15: -15, 75 }; // Mon dieu! A C++ feature? Gotta close the "extern C" added by the compiler. } template <typename T> __device__ T tex_shear(texture<T, cudaTextureType2D> ref, int pattern, float x, float y, float radius) { float2 scale = addressing_patterns[pattern]; float i = scale.x * radius, j = scale.y * radius; // Round i and j to the nearest integer, choosing the nearest even when // equidistant. It's critical that this be done before adding 'x' and 'y', // so that addressing patterns remain consistent across the grid. asm("{\n\t" "cvt.rni.ftz.f32.f32 %0, %0;\n\t" "cvt.rni.ftz.f32.f32 %1, %1;\n\t" "}\n" : "+f"(i), "+f"(j)); return tex2D(ref, x + i, y + j); } extern "C" { ''')
pixfmtlib = devlib(deps=[ringbuflib, mwclib], defs=r''' // Clamp an input between 0 and a given peak (inclusive), dithering its output, // with full clamping for pixels that are true-black for compressibility. __device__ float dclampf(mwc_st &rctx, float peak, float in) { float ret = 0.0f; if (in > 0.0f) { ret = fminf(peak, in * peak + 0.99f * mwc_next_01(rctx)); } return ret; } // Perform a conversion from float32 values to uint8 ones, applying // pixel- and channel-independent dithering to reduce suprathreshold banding // artifacts. Clamps values larger than 1.0f. // TODO: move to a separate module? // TODO: less ineffecient mwc_st handling? __global__ void f32_to_rgba_u8( uchar4 *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; uchar4 out = make_uchar4( dclampf(rctx, 255.0f, in.x), dclampf(rctx, 255.0f, in.y), dclampf(rctx, 255.0f, in.z), dclampf(rctx, 255.0f, in.w) ); int idst = dstride * y + x; dst[idst] = out; rctxs[rb_incr(rb->tail, tid)] = rctx; } // Perform a conversion from float32 values to uint16 ones, as above. __global__ void f32_to_rgba_u16( ushort4 *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; ushort4 out = make_ushort4( dclampf(rctx, 65535.0f, in.x), dclampf(rctx, 65535.0f, in.y), dclampf(rctx, 65535.0f, in.z), dclampf(rctx, 65535.0f, in.w) ); int idst = dstride * y + x; dst[idst] = out; rctxs[rb_incr(rb->tail, tid)] = rctx; } // Convert from rgb444 to planar YUV with no chroma subsampling. // Uses JPEG full-range color primaries. __global__ void f32_to_yuv444p( char *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; uchar3 out = make_uchar3( dclampf(rctx, 255.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z), dclampf(rctx, 255.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f), dclampf(rctx, 255.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f) ); int idst = dstride * y + x; dst[idst] = out.x; idst += dstride * height; dst[idst] = out.y; idst += dstride * height; dst[idst] = out.z; rctxs[rb_incr(rb->tail, tid)] = rctx; } // Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries. // TODO(strobe): Decide how YouTube will handle Rec. 2020, and then do that here. __global__ void f32_to_yuv444p10( uint16_t *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; ushort3 out = make_ushort3( dclampf(rctx, 1023.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z), dclampf(rctx, 1023.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f), dclampf(rctx, 1023.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f) ); int idst = dstride * y + x; dst[idst] = out.x; idst += dstride * height; dst[idst] = 1023.0f * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f); idst += dstride * height; dst[idst] = out.z; rctxs[rb_incr(rb->tail, tid)] = rctx; } // Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries. // Perform subsampling of chroma using weighted averages. __global__ void f32_to_yuv420p10( uint16_t *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; // Perform luma using real addressing int isrc = sstride * (y + gutter) + x + gutter; int idst = dstride * y + x; float4 in = src[isrc]; dst[idst] = dclampf(rctx, 1023.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z); // Drop into subsampling mode for chroma components if (x * 2 > dstride || y * 2 > height) return; // Recompute addressing and collect weighted averages // TODO(strobe): characterize overflow here isrc = sstride * (y * 2 + gutter) + x * 2 + gutter; in = src[isrc]; float sum = in.w + 1e-12; float cb = in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z); float cr = in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z); in = src[isrc + 1]; sum += in.w; cb += in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z); cr += in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z); isrc += sstride; in = src[isrc]; sum += in.w; cb += in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z); cr += in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z); in = src[isrc + 1]; sum += in.w; cb += in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z); cr += in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z); // For this to work, dstride must equal the output frame width // and be a multiple of four. idst = dstride * height + dstride / 2 * y + x; dst[idst] = dclampf(rctx, 1023.0f, cb / sum + 0.5f); idst += dstride * height / 4; dst[idst] = dclampf(rctx, 1023.0f, cr / sum + 0.5f); rctxs[rb_incr(rb->tail, tid)] = rctx; } // Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries. // TODO(strobe): Share more code. __global__ void f32_to_yuv444p12( uint16_t *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; in.x = fminf(1.0f, fmaxf(0.0f, in.x)); in.y = fminf(1.0f, fmaxf(0.0f, in.y)); in.z = fminf(1.0f, fmaxf(0.0f, in.z)); ushort3 out = make_ushort3( dclampf(rctx, 4095.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z), dclampf(rctx, 4095.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f), dclampf(rctx, 4095.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f) ); int idst = dstride * y + x; dst[idst] = out.x; idst += dstride * height; dst[idst] = out.y; idst += dstride * height; dst[idst] = out.z; rctxs[rb_incr(rb->tail, tid)] = rctx; } ''')
mwclib = devlib(decls=r''' typedef struct { uint32_t mul; uint32_t state; uint32_t carry; } mwc_st; ''', defs=r''' __device__ uint32_t mwc_next(mwc_st &st) { asm("{\n\t" ".reg .u32 tmp;\n\t" "mad.lo.cc.u32 tmp, %2, %1, %0;\n\t" "madc.hi.u32 %0, %2, %1, 0;\n\t" "mov.u32 %1, tmp;\n\t" "}" : "+r"(st.carry), "+r"(st.state) : "r"(st.mul)); return st.state; } __device__ float mwc_next_01(mwc_st &st) { return mwc_next(st) * (1.0f / 4294967296.0f); } __device__ float mwc_next_11(mwc_st &st) { uint32_t val = mwc_next(st); float ret; asm("cvt.rn.f32.s32 %0, %1;\n\t" "mul.f32 %0, %0, (1.0 / 2147483648.0);" : "=f"(ret) : "r"(val)); return ret; } ''')
pixfmtlib = devlib(deps=[ringbuflib, mwclib], defs=r''' // Clamp an input between 0 and a given peak (inclusive), dithering its output, // with full clamping for pixels that are true-black for compressibility. __device__ float dclampf(mwc_st &rctx, float peak, float in) { float ret = 0.0f; if (in > 0.0f) { ret = fminf(peak, in * peak + 0.99f * mwc_next_01(rctx)); } return ret; } // Perform a conversion from float32 values to uint8 ones, applying // pixel- and channel-independent dithering to reduce suprathreshold banding // artifacts. Clamps values larger than 1.0f. // TODO: move to a separate module? // TODO: less ineffecient mwc_st handling? __global__ void f32_to_rgba_u8( uchar4 *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; uchar4 out = make_uchar4( dclampf(rctx, 255.0f, in.x), dclampf(rctx, 255.0f, in.y), dclampf(rctx, 255.0f, in.z), dclampf(rctx, 255.0f, in.w) ); int idst = dstride * y + x; dst[idst] = out; rctxs[rb_incr(rb->tail, tid)] = rctx; } // Perform a conversion from float32 values to uint16 ones, as above. __global__ void f32_to_rgba_u16( ushort4 *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; ushort4 out = make_ushort4( dclampf(rctx, 65535.0f, in.x), dclampf(rctx, 65535.0f, in.y), dclampf(rctx, 65535.0f, in.z), dclampf(rctx, 65535.0f, in.w) ); int idst = dstride * y + x; dst[idst] = out; rctxs[rb_incr(rb->tail, tid)] = rctx; } // Convert from rgb444 to planar YUV with no chroma subsampling. // Uses JPEG full-range color primaries. __global__ void f32_to_yuv444p( char *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; uchar3 out = make_uchar3( dclampf(rctx, 255.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z), dclampf(rctx, 255.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f), dclampf(rctx, 255.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f) ); int idst = dstride * y + x; dst[idst] = out.x; idst += dstride * height; dst[idst] = out.y; idst += dstride * height; dst[idst] = out.z; rctxs[rb_incr(rb->tail, tid)] = rctx; } // Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries. // TODO(strobe): Decide how YouTube will handle Rec. 2020, and then do that here. __global__ void f32_to_yuv444p10( uint16_t *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; ushort3 out = make_ushort3( dclampf(rctx, 1023.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z), dclampf(rctx, 1023.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f), dclampf(rctx, 1023.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f) ); int idst = dstride * y + x; dst[idst] = out.x; idst += dstride * height; dst[idst] = 1023.0f * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f); idst += dstride * height; dst[idst] = out.z; rctxs[rb_incr(rb->tail, tid)] = rctx; } // Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries. // Perform subsampling of chroma using weighted averages. __global__ void f32_to_yuv420p10( uint16_t *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; // Perform luma using real addressing int isrc = sstride * (y + gutter) + x + gutter; int idst = dstride * y + x; float4 in = src[isrc]; dst[idst] = dclampf(rctx, 1023.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z); // Drop into subsampling mode for chroma components if (x * 2 > dstride || y * 2 > height) return; // Recompute addressing and collect weighted averages // TODO(strobe): characterize overflow here isrc = sstride * (y * 2 + gutter) + x * 2 + gutter; in = src[isrc]; float sum = in.w + 1e-12; float cb = in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z); float cr = in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z); in = src[isrc + 1]; sum += in.w; cb += in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z); cr += in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z); isrc += sstride; in = src[isrc]; sum += in.w; cb += in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z); cr += in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z); in = src[isrc + 1]; sum += in.w; cb += in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z); cr += in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z); // For this to work, dstride must equal the output frame width // and be a multiple of four. idst = dstride * height + dstride / 2 * y + x; dst[idst] = dclampf(rctx, 1023.0f, cb / sum + 0.5f); idst += dstride * height / 4; dst[idst] = dclampf(rctx, 1023.0f, cr / sum + 0.5f); rctxs[rb_incr(rb->tail, tid)] = rctx; } // Convert from rgb444 to planar YUV 12-bit studio swing, // using the Rec. 709 matrix. __global__ void f32_to_yuv444p12( uint16_t *dst, const float4 *src, int gutter, int dstride, int sstride, int height, ringbuf *rb, mwc_st *rctxs) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x > dstride || y > height) return; int isrc = sstride * (y + gutter) + x + gutter; int tid = blockDim.x * threadIdx.y + threadIdx.x; mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; float4 in = src[isrc]; in.x = fminf(1.0f, fmaxf(0.0f, in.x)); in.y = fminf(1.0f, fmaxf(0.0f, in.y)); in.z = fminf(1.0f, fmaxf(0.0f, in.z)); ushort3 out = make_ushort3( dclampf(rctx, 3504.0f, 0.2126f * in.x + 0.7152f * in.y + 0.0722f * in.z) + 256.0f, dclampf(rctx, 3584.0f, -0.11457f * in.x - 0.38543f * in.y + 0.5f * in.z + 0.5f) + 256.0f, dclampf(rctx, 3584.0f, 0.5f * in.x - 0.45416f * in.y - 0.04585f * in.z + 0.5f) + 256.0f ); int idst = dstride * y + x; dst[idst] = out.x; idst += dstride * height; dst[idst] = out.y; idst += dstride * height; dst[idst] = out.z; rctxs[rb_incr(rb->tail, tid)] = rctx; } ''')
yuvlib = devlib(defs=''' __device__ float3 rgb2yuv(float3 rgb); __device__ float3 yuv2rgb(float3 yuv); ''', decls=r''' /* This conversion uses the JPEG full-range standard. Note that UV have range * [-0.5, 0.5], so consider biasing the results. */ __device__ float3 rgb2yuv(float3 rgb) { return make_float3( 0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z, -0.168736f * rgb.x - 0.331264f * rgb.y + 0.5f * rgb.z, 0.5f * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z); } __device__ float3 yuv2rgb(float3 yuv) { return make_float3( yuv.x + 1.402f * yuv.z, yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z, yuv.x + 1.772f * yuv.y); } // As used in the various cliplibs. __device__ void yuvo2rgb(float4& pix) { pix.y -= 0.5f * pix.w; pix.z -= 0.5f * pix.w; float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z)); pix.x = fmaxf(0.0f, tmp.x); pix.y = fmaxf(0.0f, tmp.y); pix.z = fmaxf(0.0f, tmp.z); } ''')
flushatomlib = devlib(defs=Template(r''' __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) { int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (i >= nbins) return; asm volatile ({{crep(""" { .reg .u32 off, hi, lo, d, y, u, v; .reg .u64 val, ptr; .reg .f32 yf, uf, vf, df, yg, ug, vg, dg; // TODO: use explicit movs to handle this shl.b32 off, %0, 3; cvt.u64.u32 ptr, off; add.u64 ptr, ptr, %1; ld.global.v2.u32 {lo, hi}, [ptr]; shl.b32 off, %0, 4; cvt.u64.u32 ptr, off; add.u64 ptr, ptr, %2; ld.global.v4.f32 {yg,ug,vg,dg}, [ptr]; shr.u32 d, hi, 22; bfe.u32 y, hi, 4, 18; bfe.u32 u, lo, 18, 14; bfi.b32 u, hi, u, 14, 4; and.b32 v, lo, ((1<<18)-1); cvt.rn.f32.u32 yf, y; cvt.rn.f32.u32 uf, u; cvt.rn.f32.u32 vf, v; cvt.rn.f32.u32 df, d; fma.rn.ftz.f32 yg, yf, (1.0/255.0), yg; fma.rn.ftz.f32 ug, uf, (1.0/255.0), ug; fma.rn.ftz.f32 vg, vf, (1.0/255.0), vg; add.rn.ftz.f32 dg, df, dg; st.global.v4.f32 [ptr], {yg,ug,vg,dg}; } """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr)); } ''', 'flush_atom').substitute())
flushatomlib = devlib(defs=Template( r''' __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) { int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (i >= nbins) return; asm volatile ({{crep(""" { .reg .u32 off, hi, lo, d, y, u, v; .reg .u64 val, ptr; .reg .f32 yf, uf, vf, df, yg, ug, vg, dg; // TODO: use explicit movs to handle this shl.b32 off, %0, 3; cvt.u64.u32 ptr, off; add.u64 ptr, ptr, %1; ld.global.v2.u32 {lo, hi}, [ptr]; shl.b32 off, %0, 4; cvt.u64.u32 ptr, off; add.u64 ptr, ptr, %2; ld.global.v4.f32 {yg,ug,vg,dg}, [ptr]; shr.u32 d, hi, 22; bfe.u32 y, hi, 4, 18; bfe.u32 u, lo, 18, 14; bfi.b32 u, hi, u, 14, 4; and.b32 v, lo, ((1<<18)-1); cvt.rn.f32.u32 yf, y; cvt.rn.f32.u32 uf, u; cvt.rn.f32.u32 vf, v; cvt.rn.f32.u32 df, d; fma.rn.ftz.f32 yg, yf, (1.0/255.0), yg; fma.rn.ftz.f32 ug, uf, (1.0/255.0), ug; fma.rn.ftz.f32 vg, vf, (1.0/255.0), vg; add.rn.ftz.f32 dg, df, dg; st.global.v4.f32 [ptr], {yg,ug,vg,dg}; } """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr)); } ''', 'flush_atom').substitute())
catmullromlib = devlib( deps=[binsearchlib], decls=r""" __device__ __noinline__ float catmull_rom(const float *times, const float *knots, float t); __device__ __noinline__ float catmull_rom_mag(const float *times, const float *knots, float t); """, defs=r""" // ELBOW is the linearization threhsold; above this magnitude, a value scales // logarithmically, and below it, linearly. ELOG1 is a constant used to make // this happen. See helpers/spline_mag_domain_interp.wxm for nice graphs. #define ELBOW 0.0625f // 2^(-4) #define ELOG1 5.0f // 1 - log2(elbow) // Transform from linear to magnitude domain __device__ float linlog(float x) { if (x > ELBOW) return log2f(x) + ELOG1; if (x < -ELBOW) return -(log2f(-x) + ELOG1); return x / ELBOW; } // Reverse of above __device__ float linexp(float v) { if (v >= 1.0) return exp2f( v - ELOG1); if (v <= -1.0) return -exp2f(-v - ELOG1); return v * ELBOW; } __device__ float linslope(float x, float m) { if (x >= ELBOW) return m / x; if (x <= -ELBOW) return m / -x; return m / ELBOW; } __device__ float catmull_rom_base(const float *times, const float *knots, float t, bool mag) { int idx = bitwise_binsearch(times, t); // The left bias of the search means that we never have to worry about // overshooting unless the genome is corrupted idx = max(idx, 1); float t1 = times[idx], t2 = times[idx+1] - t1; float rt2 = 1.0f / t2; float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2; t = (t - t1) * rt2; // Now t1 is effectively 0 and t2 is 1 float k0 = knots[idx-1], k1 = knots[idx], k2 = knots[idx+1], k3 = knots[idx+2]; float m1 = (k2 - k0) / (1.0f - t0), m2 = (k3 - k1) / (t3); if (mag) { m1 = linslope(k1, m1); m2 = linslope(k2, m2); k1 = linlog(k1); k2 = linlog(k2); } float tt = t * t, ttt = tt * t; float r = m1 * ( ttt - 2.0f*tt + t) + k1 * ( 2.0f*ttt - 3.0f*tt + 1) + m2 * ( ttt - tt) + k2 * (-2.0f*ttt + 3.0f*tt); if (mag) r = linexp(r); return r; } // Variants with scaling domain logic inlined __device__ __noinline__ float catmull_rom(const float *times, const float *knots, float t) { return catmull_rom_base(times, knots, t, false); } __device__ __noinline__ float catmull_rom_mag(const float *times, const float *knots, float t) { return catmull_rom_base(times, knots, t, true); } """, )