def xgblogit(label, factors, trainselector=None, mu=0.5, eta=0.1, lambda_=1.0, gamma=0.0,
             maxdepth=2, nrounds=2, minh=1.0, slicelen=10000):
    """Grow ``nrounds`` boosted trees for a logit objective.

    The raw score vector starts at ``logitraw(mu)`` for every observation.
    Each round refreshes the gradient/hessian buffers from the current raw
    scores (restricted through ``trainselector`` when one is given), grows one
    tree with ``growtree`` and carries the scores it returns into the next
    round.

    Returns:
        A ``(trees, fm)`` tuple: the list of grown trees and the final score
        vector after ``get_pred`` has been called on it.
    """
    raw0 = np.full(len(label), logitraw(mu), dtype=np.float32)
    labels = label.to_array()
    nobs = len(raw0)
    # Gradient / hessian buffers are allocated once and refilled every round.
    grad = np.zeros(nobs, dtype=np.float32)
    hess = np.zeros(nobs, dtype=np.float32)

    def boost_round(state, _round):
        scores, grown = state
        if trainselector is None:
            get_gh(scores, labels, grad, hess)
        else:
            get_gh_sel(trainselector, scores, labels, grad, hess)
        grad_cov = Covariate.from_array(grad)
        hess_cov = Covariate.from_array(hess)
        new_tree, new_scores = growtree(factors, grad_cov, hess_cov, scores, eta,
                                        maxdepth, lambda_, gamma, minh, slicelen)
        grown.append(new_tree)
        return (new_scores, grown)

    rounds = Seq.from_gen((r for r in range(nrounds)))
    fm, trees = Seq.reduce(boost_round, (raw0, []), rounds)
    # Return value is ignored, so get_pred presumably updates fm in place -- TODO confirm.
    get_pred(fm)
    return trees, fm
def splitnodeidsslice(nodeids, factors, issplitnode, nodemap, leftpartitions, factorindex,
                      start, length, slicelength):
    """Apply ``f_splitnode`` to every aligned (node-id slice, factor slice) pair.

    The range ``start``/``length`` is walked in chunks of ``slicelength``.
    Nothing is done when ``factors`` is empty. Always returns ``None``.
    """
    if len(factors) == 0:
        return

    factorslices = NFactorSlicer(factors)(start, length, slicelength)
    nodeslices = VectorSlicer(nodeids)(start, length, slicelength)

    def split_chunk(pair):
        node_chunk, factor_chunk = pair[0], pair[1]
        return f_splitnode(node_chunk, factor_chunk, issplitnode, leftpartitions,
                           factorindex, nodemap)

    Seq.foreach(split_chunk, Seq.zip(nodeslices, factorslices))
def get_hist_slice(gsum0, hsum0, nodeids, nodecansplit, factor, gcovariate, hcovariate,
                   start, length, slicelen):
    """Fold ``f_hist`` over aligned slices of node ids, factor, g and h.

    The four slice streams cover the same ``start``/``length`` range in chunks
    of ``slicelen`` and are zipped together. The fold starts from
    ``(gsum0, hsum0, nodecansplit)`` and its final accumulator is returned.
    """
    streams = (
        VectorSlicer(nodeids)(start, length, slicelen),
        factor.slicer(start, length, slicelen),
        gcovariate.slicer(start, length, slicelen),
        hcovariate.slicer(start, length, slicelen),
    )
    initial = (gsum0, hsum0, nodecansplit)
    return Seq.reduce(f_hist, initial, Seq.zip(*streams))
def __call__(self, start, length, slicelen):
    """Return a lazy sequence of consecutive slices of ``self.vector``.

    ``start`` must be a valid index into the vector (asserted). ``length`` is
    clamped to the elements available from ``start``, and ``slicelen`` to the
    clamped ``length``. Slice bounds are produced by ``next_slice_indices``.
    """
    total = len(self.vector)
    assert 0 <= start < total
    length = min(length, total - start)
    slicelen = min(length, slicelen)

    def cut(bounds):
        lo = bounds[0]
        hi = bounds[1]
        return self.vector[lo:hi]

    bounds_seq = Seq.from_next((start, length, slicelen), next_slice_indices)
    return Seq.map(cut, bounds_seq)
def slicer(start, length, slicelen):
    """Return a lazy sequence of covariate slices passed through the cut transform.

    ``length`` is clamped to what is available from ``start`` and ``slicelen``
    to the clamped ``length``. One output buffer is allocated per call and
    handed to the transform for every slice.
    """
    length = min(len(self) - start, length)
    slicelen = min(length, slicelen)
    # Shared by all slices; presumably each result aliases it, so consume a
    # slice before pulling the next one -- TODO confirm.
    buf = np.empty(slicelen, dtype=dtype)

    if rightclosed:
        def transform(chunk):
            return g_rightclosed(chunk, buf, cuts)
    else:
        def transform(chunk):
            return g_leftclosed(chunk, buf, cuts, levelcount - 1)

    return Seq.map(transform, covariate.slicer(start, length, slicelen))
def slicer(self):
    """Return a ``VectorSlicer`` over the cached copy of the wrapped factor.

    On first use the wrapped factor is read in chunks of ``SLICELEN`` and
    copied into ``self.cache``. Later calls reuse that array.
    """
    if self.cache is None:
        self.cache = np.empty(len(self), dtype=self.factor.slicer.dtype)
        chunks = self.factor.slicer(0, len(self.factor), SLICELEN)

        def store(pos, chunk):
            # Write the chunk at the running offset and return the next offset.
            end = pos + len(chunk)
            self.cache[pos:end] = chunk
            return end

        Seq.reduce(store, 0, chunks)
    return VectorSlicer(self.cache)
def growtree(factors, gcovariate, hcovariate, fm, eta, maxdepth, lambda_, gamma, minh, slicelen):
    """Grow one tree of up to ``maxdepth`` layers and run ``predict`` on ``fm``.

    Every observation starts in node 0. The single root leaf allows all levels
    of every factor and carries the largest float32 value as its loss.

    Returns:
        ``(layers, fm)``: the list of up to ``maxdepth`` items produced by
        ``nextlayer``, and the ``fm`` object that was passed in.
    """
    nobs = len(gcovariate)
    maxnodecount = 2 ** maxdepth
    # Use the narrowest of the two supported id widths.
    if maxnodecount <= np.iinfo(np.uint8).max:
        id_dtype = np.uint8
    else:
        id_dtype = np.uint16
    nodeids = np.zeros(nobs, dtype=id_dtype)

    loss0 = np.finfo(np.float32).max
    open_partitions = {f: np.full(len(f.levels), True) for f in factors}
    root = LeafNode((0.0, 0.0), True, open_partitions, loss0)

    state0 = TreeGrowState(nodeids, [root], factors, gcovariate, hcovariate,
                           gamma, lambda_, minh, slicelen)
    layer_seq = Seq.take(Seq.from_next(state0, nextlayer), maxdepth)
    layers = Seq.tolist(layer_seq)

    # NOTE(review): predict reads state0.nodes and nodeids; whether nextlayer
    # updates these in place is not visible from this function -- confirm.
    predict(state0.nodes, nodeids, fm, eta, lambda_)
    return layers, fm
def __call__(self, start, length, slicelen):
    """Return a lazy sequence of data chunks read from the file at ``self.path``.

    The file is opened in binary mode and positioned at element ``start``
    (byte offset ``itemsize * start``). Chunks are then produced by
    ``next_data_chunk`` from the ``(buffer, stream, start, length, slicelen)``
    state.
    """
    assert start >= 0
    chunklen = min(length, slicelen)
    # The handle stays open because reading is deferred to the returned
    # sequence. NOTE(review): confirm next_data_chunk closes it when done.
    stream = open(self.path, "rb")
    chunkbuf = np.empty(chunklen, dtype=self.dtype)
    byte_offset = chunkbuf.dtype.itemsize * start
    stream.seek(byte_offset)
    return Seq.from_next((chunkbuf, stream, start, length, chunklen), next_data_chunk)
def __repr__(self):
    """Return a summary with the name, observation count and leading values."""
    nhead = min(HEADLENGTH, len(self))
    chunks = self.slicer(0, nhead, HEADLENGTH)

    def render(text, chunk):
        # Values are space-separated, with a trailing space after each chunk.
        return text + ' '.join(map(str, chunk)) + " "

    head = Seq.reduce(render, "", chunks)
    template = "BoolVariate {var} with {len} obs: {head}"
    return template.format(var=self.name, len=len(self), head=head)
def get_freq(self):
    """Count how many times each level code occurs.

    Returns:
        An int32 array with one entry per element of ``self.levels``, where
        position ``i`` holds the number of observations whose code is ``i``.
    """
    chunks = self.slicer(0, len(self), SLICELEN)
    counts = np.zeros(len(self.levels), dtype=np.int32)

    @jit(nopython=True, cache=True)
    def tally(totals, chunk):
        for code in chunk:
            totals[code] += 1
        return totals

    return Seq.reduce(tally, counts, chunks)
def __repr__(self):
    """Return a summary with the name, observation count and leading values.

    NaN values are shown as ``.``. ``...`` is appended when not every
    observation is shown.
    """
    nhead = min(HEADLENGTH, len(self))
    chunks = self.slicer(0, nhead, HEADLENGTH)

    def render(text, chunk):
        cells = ["." if np.isnan(value) else str(value) for value in chunk]
        return text + ' '.join(cells) + " "

    head = Seq.reduce(render, "", chunks)
    if nhead == len(self):
        suffix = ""
    else:
        suffix = "..."
    template = "Covariate {cov} with {len} obs: {head}{s}"
    return template.format(cov=self.name, len=len(self), head=head, s=suffix)
def __repr__(self):
    """Return a summary with the name, counts and leading level labels.

    The reported level count is ``len(self.levels) - 1``. ``...`` is appended
    when not every observation is shown.
    """
    nhead = min(HEADLENGTH, len(self))
    chunks = self.slicer(0, nhead, HEADLENGTH)
    levels = self.levels

    def render(text, chunk):
        labels = [levels[code] for code in chunk]
        return text + ' '.join(labels) + " "

    head = Seq.reduce(render, "", chunks)
    if nhead == len(self):
        suffix = ""
    else:
        suffix = "..."
    template = "Factor {f} with {len} obs and {n} levels: {head}{s}"
    return template.format(f=self.name, len=len(self), head=head,
                           n=len(levels) - 1, s=suffix)
def __call__(self, start, length, slicelen):
    """Return a lazy sequence of 2-D blocks, one row per factor.

    The block dtype is uint8 when every factor slicer reports uint8, and
    uint16 otherwise. One ``(k, slicelen)`` buffer is allocated per call and
    refilled for every step. A step shorter than ``slicelen`` yields a column
    view of that buffer.
    """
    dtypes = [fac.slicer.dtype for fac in self.factors]
    if all(d == np.uint8 for d in dtypes):
        dtype = np.uint8
    else:
        dtype = np.uint16
    k = len(self.factors)
    buf = np.empty((k, slicelen), dtype=dtype)

    def stack(slices):
        n = len(slices[0])
        for row, src_dtype in enumerate(dtypes):
            chunk = slices[row]
            # Convert only the rows whose source dtype differs from the block dtype.
            buf[row, :n] = chunk if src_dtype == dtype else chunk.astype(dtype)
        return buf if n == slicelen else buf[:, :n]

    per_factor = [fac.slicer(start, length, slicelen) for fac in self.factors]
    return Seq.map(stack, Seq.zip(*per_factor))
def unique(self):
    """Return the distinct values of this variable as a sorted array.

    All slices are folded with ``f_unique`` into a typed dictionary whose key
    type matches the slicer dtype (float32 or float64). The dictionary keys
    are then returned as a sorted array of that dtype.
    """
    chunks = self.slicer(0, len(self), SLICELEN)
    if self.slicer.dtype == np.float32:
        keytype = types.float32
    else:
        keytype = types.float64
    seen0 = Dict.empty(key_type=keytype, value_type=keytype)
    seen = Seq.reduce(f_unique, seen0, chunks)
    values = np.array(list(seen.keys()), dtype=self.slicer.dtype)
    return np.sort(values)
def to_array(self):
    """Return all observations as one array.

    The slicer is asked for the whole range as a single slice, and the first
    element of the sequence is returned.
    """
    nobs = len(self)
    # NOTE(review): the result may alias a buffer owned by the slicer -- confirm
    # before mutating it.
    first, _rest = Seq.try_read(self.slicer(0, nobs, nobs))
    return first
def slicer(start, length, slicelen):
    """Return a lazy sequence of all-ones uint8 chunks covering the range.

    ``length`` is clamped to what is available from ``start`` and ``slicelen``
    to the clamped ``length``. One buffer of ``slicelen`` ones is allocated
    per call, and each yielded chunk is a prefix view of it.
    """
    length = min(self._length - start, length)
    slicelen = min(length, slicelen)
    buf = np.ones(slicelen, dtype=np.uint8)

    def ones_chunk(bounds):
        # The (from, to) bounds are seeded with ``start`` and presumably index
        # the whole data range, while ``buf`` holds a single chunk. Indexing
        # ``buf`` with them directly gives truncated or empty chunks whenever
        # a chunk does not begin at 0, so only the chunk length is used.
        return buf[:bounds[1] - bounds[0]]

    return Seq.map(ones_chunk,
                   Seq.from_next((start, length, slicelen), ch.next_slice_indices))
def f(start, length, slicelen):
    """Return a lazy sequence of ``vslicer`` slices passed through ``g``.

    One float32 buffer of ``slicelen`` elements is allocated per call and
    handed to ``g`` together with every slice.
    """
    buf = np.empty(slicelen, dtype=np.float32)

    def transform(chunk):
        return g(chunk, buf)

    return Seq.map(transform, vslicer(start, length, slicelen))
def slice(start, length, slicelen):
    """Return a lazy sequence of base-factor slices passed through ``g_parse``.

    One float32 buffer of ``slicelen`` elements is allocated per call and
    handed to ``g_parse`` together with every slice and ``parsed``.
    """
    buf = np.empty(slicelen, dtype=np.float32)

    def parse_chunk(codes):
        return g_parse(codes, buf, parsed)

    return Seq.map(parse_chunk, basefactor.slicer(start, length, slicelen))