def slicer(self):
    """Return a ``VectorSlicer`` over the cached copy of the wrapped factor.

    On the first call (``self.cache is None``) an array of ``len(self)``
    elements with the dtype of ``self.factor.slicer`` is allocated and
    filled by walking ``self.factor`` in chunks of ``SLICELEN``. Later
    calls reuse the stored array.
    """
    if self.cache is not None:
        return VectorSlicer(self.cache)

    # Allocate before filling so the copy closure below can write into it.
    self.cache = np.empty(len(self), dtype=self.factor.slicer.dtype)
    chunks = self.factor.slicer(0, len(self.factor), SLICELEN)

    def copy_chunk(start, chunk):
        # Write one chunk at the running offset and return the next offset.
        stop = start + len(chunk)
        self.cache[start:stop] = chunk
        return stop

    Seq.reduce(copy_chunk, 0, chunks)
    return VectorSlicer(self.cache)
def xgblogit(label, factors, trainselector=None, mu=0.5, eta=0.1, lambda_=1.0,
             gamma=0.0, maxdepth=2, nrounds=2, minh=1.0, slicelen=10000):
    """Fit ``nrounds`` boosted trees for a logistic objective.

    Args:
        label: Target variable; must support ``len()`` and ``to_array()``.
        factors: Candidate split factors, forwarded to ``growtree``.
        trainselector: Optional selector. When given, ``get_gh_sel`` is
            used instead of ``get_gh`` to compute ``g`` and ``h``.
        mu: Starting value; every observation starts at ``logitraw(mu)``.
        eta, lambda_, gamma, maxdepth, minh, slicelen: Forwarded unchanged
            to ``growtree``.
        nrounds: Number of boosting rounds (one tree per round).

    Returns:
        A ``(trees, fm)`` tuple: the list of grown trees and the final
        prediction array after ``get_pred`` has been applied to it.
    """
    nobs = len(label)
    start_pred = np.full(nobs, logitraw(mu), dtype=np.float32)
    label = label.to_array()

    # Buffers reused on every round.
    # NOTE(review): presumably get_gh / get_gh_sel overwrite them in place
    # - confirm.
    g = np.zeros(nobs, dtype=np.float32)
    h = np.zeros(nobs, dtype=np.float32)

    def step(state, _round):
        current_pred, grown = state
        if trainselector is None:
            get_gh(current_pred, label, g, h)
        else:
            get_gh_sel(trainselector, current_pred, label, g, h)
        g_cov = Covariate.from_array(g)
        h_cov = Covariate.from_array(h)
        tree, next_pred = growtree(factors, g_cov, h_cov, current_pred, eta,
                                   maxdepth, lambda_, gamma, minh, slicelen)
        grown.append(tree)
        return (next_pred, grown)

    rounds = Seq.from_gen((i for i in range(nrounds)))
    fm, trees = Seq.reduce(step, (start_pred, []), rounds)
    # The return value is discarded.
    # NOTE(review): presumably get_pred converts fm in place - confirm.
    get_pred(fm)
    return trees, fm
def get_hist_slice(gsum0, hsum0, nodeids, nodecansplit, factor, gcovariate,
                   hcovariate, start, length, slicelen):
    """Fold ``f_hist`` over aligned chunks of node ids, factor, g and h.

    The same ``(start, length, slicelen)`` window is applied to a
    ``VectorSlicer`` over ``nodeids`` and to the slicers of ``factor``,
    ``gcovariate`` and ``hcovariate``. The four chunk sequences are zipped
    and reduced with ``f_hist``, starting from
    ``(gsum0, hsum0, nodecansplit)``.

    Returns:
        Whatever ``Seq.reduce`` yields for that fold.
    """
    window = (start, length, slicelen)
    node_slicer = VectorSlicer(nodeids)
    zipped = Seq.zip(
        node_slicer(*window),
        factor.slicer(*window),
        gcovariate.slicer(*window),
        hcovariate.slicer(*window),
    )
    initial = (gsum0, hsum0, nodecansplit)
    return Seq.reduce(f_hist, initial, zipped)
def __repr__(self):
    """Return a short description showing the first observations.

    At most ``HEADLENGTH`` values are rendered. When the variate holds
    more observations than are shown, the head is followed by ``"..."``
    so the output is visibly truncated.
    """
    k = min(HEADLENGTH, len(self))
    slices = self.slicer(0, k, HEADLENGTH)

    def f(acc, chunk):
        # Append the chunk's values, space separated, to the running text.
        return acc + ' '.join([str(v) for v in chunk]) + " "

    datahead = Seq.reduce(f, "", slices)
    # Flag truncation only when some observations were left out.
    s = "" if k == len(self) else "..."
    return "BoolVariate {var} with {len} obs: {head}{s}".format(
        var=self.name, len=len(self), head=datahead, s=s)
def get_freq(self):
    """Count how often each level index occurs.

    Returns:
        The result of folding the counting kernel over all chunks,
        starting from an ``int32`` array of ``len(self.levels)`` zeros
        that is indexed by the values read from ``self.slicer``.
    """
    chunks = self.slicer(0, len(self), SLICELEN)
    counts = np.zeros(len(self.levels), dtype=np.int32)

    @jit(nopython=True, cache=True)
    def tally(acc, chunk):
        # Each value in the chunk is used directly as an index into acc.
        for code in chunk:
            acc[code] += 1
        return acc

    return Seq.reduce(tally, counts, chunks)
def __repr__(self):
    """Return a short description showing the first level labels.

    At most ``HEADLENGTH`` observations are rendered, each translated to
    its label via ``self.levels``. ``"..."`` is appended when the factor
    has more observations than are shown. The reported level count is
    ``len(self.levels) - 1``.
    """
    nobs = len(self)
    shown = min(HEADLENGTH, nobs)
    chunks = self.slicer(0, shown, HEADLENGTH)
    labels = self.levels

    def append_labels(text, chunk):
        return text + ' '.join(labels[code] for code in chunk) + " "

    head = Seq.reduce(append_labels, "", chunks)
    if shown == nobs:
        suffix = ""
    else:
        suffix = "..."
    template = "Factor {f} with {len} obs and {n} levels: {head}{s}"
    return template.format(
        f=self.name, len=nobs, head=head, n=len(labels) - 1, s=suffix)
def __repr__(self):
    """Return a short description showing the first values.

    At most ``HEADLENGTH`` observations are rendered; NaN values are
    printed as ``"."``. ``"..."`` is appended when the covariate has more
    observations than are shown.
    """
    nobs = len(self)
    shown = min(HEADLENGTH, nobs)
    chunks = self.slicer(0, shown, HEADLENGTH)

    def render(value):
        # NaN values are displayed as a single dot.
        return "." if np.isnan(value) else str(value)

    def append_values(text, chunk):
        return text + ' '.join(render(v) for v in chunk) + " "

    head = Seq.reduce(append_values, "", chunks)
    if shown == nobs:
        suffix = ""
    else:
        suffix = "..."
    template = "Covariate {cov} with {len} obs: {head}{s}"
    return template.format(cov=self.name, len=nobs, head=head, s=suffix)
def unique(self):
    """Return the distinct values of this variable as a sorted array.

    All chunks are folded with ``f_unique`` into a typed ``Dict`` whose
    key and value types match the slicer's dtype (``float32`` if the
    slicer is ``float32``, otherwise ``float64``). The dictionary keys
    are then copied into an array of the slicer's dtype and sorted
    ascending.

    NOTE(review): presumably ``f_unique`` skips NaN values - confirm
    against its definition.
    """
    slicer = self.slicer
    chunks = slicer(0, len(self), SLICELEN)
    value_dtype = slicer.dtype

    if value_dtype == np.float32:
        key_type = types.float32
    else:
        key_type = types.float64

    # The typed dictionary stands in for a set; only its keys are used.
    seen = Dict.empty(key_type=key_type, value_type=key_type)
    seen = Seq.reduce(f_unique, seen, chunks)

    values = np.array(list(seen.keys()), dtype=value_dtype)
    values.sort()
    return values