def groupby3(arg):
    """Return 1.0 minus the sum of squared relative frequencies of *arg*'s values.

    *arg* is converted with ``as_num_array`` and passed through ``sort`` so
    that equal values are adjacent; each run length is then divided by
    ``len(arg)``, squared, and the squares are summed.
    """
    values = as_num_array(arg)
    total = float(len(values))
    # groupby only merges adjacent equal items, hence the sort beforehand
    run_lengths = [len(list(run)) for _key, run in groupby(sort(values))]
    shares = as_num_array(run_lengths) / total
    shares *= shares  # square in place
    return 1.0 - shares.sum()
def isort(dep,indep,cutpoints=None,**kwargs):
    """Score every cut point by calling gini2_counts on incrementally updated counts.

    Both vectors are reordered by ascending *indep*.  The sorted data is
    split into consecutive index ranges (one per cut point) and two count
    dictionaries are updated range by range: counts move out of ``cnt2``
    and into ``cnt1``.

    Parameters:
        dep: sequence converted with as_num_array; its values are the keys
            of the count dictionaries.
        indep: sequence converted with as_num_array; compared to the cut points.
        cutpoints: thresholds to evaluate; defaults to midpoints_integer(indep).
            NOTE(review): the range-building loop appears to expect these in
            ascending order -- confirm against callers.
        **kwargs: accepted but not used in this function.

    Returns:
        A list with one gini2_counts result per cut point, or [] when
        there are no cut points.

    Raises:
        AssertionError: if the number of index ranges, or of results, does
            not equal len(cutpoints).
    """
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)
    if not len(cutpoints):
        return []
    # sort both vectors by *indep*
    idx = argsort(indep)
    dep = take(dep,idx)
    indep = take(indep,idx)
    # cutidx holds range boundaries: range i is dep[cutidx[i]:cutidx[i+1]]
    cutidx = [0,0]
    for ival,isub in it.groupby(indep):
        # number of sorted elements sharing this indep value
        ilen = len(list(isub))
        # len(cutidx)-2 is the index of the cut point currently being filled
        if ival < cutpoints[len(cutidx)-2]:
            cutidx[-1] += ilen
        else:
            # stop once there is one range per cut point
            if len(cutidx) > len(cutpoints):
                break
            # open the next range, starting with this run
            cutidx.append(cutidx[-1]+ilen)
    assert len(cutidx)-1 == len(cutpoints), '%s != %s' % (len(cutidx)-1,len(cutpoints))
    out = []
    # cnt2 starts with the counts for all of dep, cnt1 with zero for every key
    cnt2 = dict(histo_tuple(dep))
    cnt1 = dict.fromkeys(cnt2.keys(),0)
    for i1,i2 in izip(cutidx[:-1],cutidx[1:]):
        # update the counts from the last cut
        for d,cnt in histo_tuple(dep[i1:i2]):
            cnt1[d] += cnt
            cnt2[d] -= cnt
        # calculate results based on counts (zero counts are dropped)
        a1 = as_num_array([val for val in cnt1.itervalues() if val != 0])
        a2 = as_num_array([val for val in cnt2.itervalues() if val != 0])
        out.append(gini2_counts(a1,a2))
    assert len(out) == len(cutpoints), '%s != %s' % (len(out),len(cutpoints))
    return out
def vector1(arg1, arg2):
    """Add the gini_counts of two vectors, each weighted by its share of the total.

    Both arguments are converted with ``as_num_array``.  A vector's weight
    is its own sum divided by the combined sum of both vectors.
    """
    left = as_num_array(arg1)
    right = as_num_array(arg2)
    left_total = left.sum()
    right_total = right.sum()
    grand_total = float(left_total + right_total)
    left_term = gini_counts(left) * left_total / grand_total
    right_term = gini_counts(right) * right_total / grand_total
    return left_term + right_term
def missing1(arg,delta=1,first=False):
    """For each value ``v`` in *arg*, look up an index recorded for ``v + delta``.

    Runs of equal adjacent values in *arg* are located with groupby and each
    run's key is mapped to the run's first index (``first=True``) or last
    index (``first=False``).  Keys of the form ``key + delta`` that have no
    run of their own are filled in from an existing key (see the loop
    below) before the final lookup.

    Parameters:
        arg: indexable sequence; ``arg[0]`` and ``arg[-1]`` are read as the
            minimum and maximum key.
            NOTE(review): this assumes arg is sorted ascending -- confirm.
        delta: offset added to each value before the lookup.
        first: choose the first (True) or last (False) index of each run.

    Returns:
        A list with one index per element of *arg*.
    """
    # build answer lookup mapping each arg value to first index
    run_lens = [(k,len(list(g))) for k,g in groupby(arg)]
    keys = as_num_array([k for k,l in run_lens])
    lens = as_num_array([l for k,l in run_lens])
    # ends[i] is one past the last index of run i; starts[i] is its first index
    ends = cumsum(lens)
    starts = ends - lens
    if first:
        answer = dict(izip(keys,starts))
    else:
        answer = dict(izip(keys,ends-1))
    # identify missing keys
    need = keys + delta
    needset = set(need)
    haveset = set(answer)
    fillset = needset.difference(haveset)
    fill = as_num_array(sorted(fillset))
    # extremes of the existing keys (taken from the ends of arg)
    minkey,maxkey = arg[0],arg[-1]
    # walk both the existing keys and the missing keys in reverse order
    have_iter = iter(keys[-1::-1])
    fill_iter = iter(fill[-1::-1])
    thiskey = maxkey
    thisval = answer[thiskey]
    for fillkey in fill_iter:
        if thiskey >= fillkey:
            try:
                # advance to the next existing key that is below fillkey
                thiskey = dropwhile(lambda x:x>=fillkey,have_iter).next()
            except StopIteration:
                # no existing key is below fillkey: fall back to the minimum
                thiskey = minkey
            thisval = answer[thiskey]
        # the missing key borrows the index of the key selected above
        answer[fillkey] = thisval
    out = [answer[val+delta] for val in arg]
    return out
def vector1(arg1, arg2):
    """Add the gini_counts of two vectors, each weighted by its share of the total.

    Both arguments are converted with ``as_num_array``.  A vector's weight
    is its own sum divided by the combined sum of both vectors.
    """
    left = as_num_array(arg1)
    right = as_num_array(arg2)
    left_total = left.sum()
    right_total = right.sum()
    grand_total = float(left_total + right_total)
    left_term = gini_counts(left) * left_total / grand_total
    right_term = gini_counts(right) * right_total / grand_total
    return left_term + right_term
def deltacnt1(dep, indep, cutpoints=None, **kwargs):
    """Score each cut point with gini2_counts, updating value counts incrementally.

    Two dictionaries keyed by the values of *dep* are maintained: one for
    elements whose *indep* is below some cut seen so far, one for the rest.
    For every cut only the elements newly below the cut are moved.

    Parameters:
        dep, indep: sequences converted with ``as_num_array``.
        cutpoints: thresholds to evaluate; defaults to
            ``midpoints_integer(indep)``.
        **kwargs: accepted but not used here.

    Returns:
        A list with one ``gini2_counts`` result per cut point.
    """
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)

    # every element starts on the "not below" side
    above = dict(histo_tuple(dep))
    below = dict.fromkeys(above.keys(), 0)

    # elementwise self-inequality: the starting mask of already-moved elements
    seen = (indep != indep)
    results = []
    for threshold in cutpoints:
        under = indep < threshold
        fresh = under & ~seen  # below this cut but not moved yet
        seen |= under
        for value in dep[nonzero(fresh)[0]]:
            below[value] += 1
            above[value] -= 1
        # zero counts are left out of the vectors handed to gini2_counts
        below_counts = as_num_array([c for c in below.itervalues() if c != 0])
        above_counts = as_num_array([c for c in above.itervalues() if c != 0])
        results.append(gini2_counts(below_counts, above_counts))
    return results
def deltacnt1(dep, indep, cutpoints=None, **kwargs):
    """Score each cut point with gini2_counts, updating value counts incrementally.

    Two dictionaries keyed by the values of *dep* are maintained: one for
    elements whose *indep* is below some cut seen so far, one for the rest.
    For every cut only the elements newly below the cut are moved.

    Parameters:
        dep, indep: sequences converted with ``as_num_array``.
        cutpoints: thresholds to evaluate; defaults to
            ``midpoints_integer(indep)``.
        **kwargs: accepted but not used here.

    Returns:
        A list with one ``gini2_counts`` result per cut point.
    """
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)

    # every element starts on the "not below" side
    above = dict(histo_tuple(dep))
    below = dict.fromkeys(above.keys(), 0)

    # elementwise self-inequality: the starting mask of already-moved elements
    seen = (indep != indep)
    results = []
    for threshold in cutpoints:
        under = indep < threshold
        fresh = under & ~seen  # below this cut but not moved yet
        seen |= under
        for value in dep[nonzero(fresh)[0]]:
            below[value] += 1
            above[value] -= 1
        # zero counts are left out of the vectors handed to gini2_counts
        below_counts = as_num_array([c for c in below.itervalues() if c != 0])
        above_counts = as_num_array([c for c in above.itervalues() if c != 0])
        results.append(gini2_counts(below_counts, above_counts))
    return results
def groupby3(arg):
    """Return 1.0 minus the sum of squared relative frequencies of *arg*'s values.

    *arg* is converted with ``as_num_array`` and passed through ``sort`` so
    that equal values are adjacent; each run length is then divided by
    ``len(arg)``, squared, and the squares are summed.
    """
    values = as_num_array(arg)
    total = float(len(values))
    # groupby only merges adjacent equal items, hence the sort beforehand
    run_lengths = [len(list(run)) for _key, run in groupby(sort(values))]
    shares = as_num_array(run_lengths) / total
    shares *= shares  # square in place
    return 1.0 - shares.sum()
def naive2(arg, sel=None, step=1):
    """Return ``max - min`` of each slice of *arg* between indices *step* apart.

    The index sequence comes from ``arg_sel_step_to_idx(arg, sel, step)``.
    For every pair ``(j, k)`` of indices *step* positions apart, the slice
    ``arg[j+1:k+1]`` is reduced to its maximum minus its minimum.
    """
    data = as_num_array(arg)
    idx = arg_sel_step_to_idx(data, sel, step)
    lows = []
    highs = []
    for start, stop in izip(idx[:-step], idx[step:]):
        window = data[start + 1:stop + 1]
        lows.append(window.min())
        highs.append(window.max())
    return as_num_array(highs) - lows
def naive2(arg, sel=None, step=1):
    """Return ``max - min`` of each slice of *arg* between indices *step* apart.

    The index sequence comes from ``arg_sel_step_to_idx(arg, sel, step)``.
    For every pair ``(j, k)`` of indices *step* positions apart, the slice
    ``arg[j+1:k+1]`` is reduced to its maximum minus its minimum.
    """
    data = as_num_array(arg)
    idx = arg_sel_step_to_idx(data, sel, step)
    lows = []
    highs = []
    for start, stop in izip(idx[:-step], idx[step:]):
        window = data[start + 1:stop + 1]
        lows.append(window.min())
        highs.append(window.max())
    return as_num_array(highs) - lows
def smart(dep, indep, cutpoints=None, **kwargs):
    """Dispatch to ``PartitionIntegerGini.naive`` or ``.isort`` based on input size.

    Inputs with fewer than 100 elements in *dep* go to ``naive``; everything
    else goes to ``isort``.  The converted vectors, the cut points
    (defaulting to ``midpoints_integer(indep)``) and any extra keyword
    arguments are forwarded unchanged.
    """
    # not needed unless high penalty for small datasets
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)
    if len(dep) < 100:
        implementation = PartitionIntegerGini.naive
    else:
        implementation = PartitionIntegerGini.isort
    return implementation(dep, indep, cutpoints=cutpoints, **kwargs)
def fast(arg1, arg2, reset_value=0.0, out=None):
    """Run ``CusumReset().iterfunc`` over the log of the elementwise ratio arg2/arg1.

    Parameters:
        arg1, arg2: sequences converted with ``as_num_array``.
        reset_value: forwarded to ``iterfunc`` as ``reset_value``.
        out: optional output array; when omitted one is allocated with
            ``arg1.new()``.

    Returns:
        The output array (the caller's *out* when one was supplied).
    """
    arg1 = as_num_array(arg1)
    arg2 = as_num_array(arg2)
    # Compare against None explicitly: truth-testing an array is either an
    # error or depends on its contents, so ``not out`` cannot detect
    # "no buffer supplied".
    if out is None:
        out = arg1.new()
    cusum_func = CusumReset().iterfunc
    log_odds = log(arg2 / arg1)
    cusum_func(log_odds, reset_value=reset_value, out=out)
    return out
def naive1(arg, sel=None, step=1):
    """Return ``max - min`` of each non-empty slice between indices *step* apart.

    The index sequence comes from ``arg_sel_step_to_idx(arg, sel, step)``.
    Pairs whose two indices are equal, and slices ``arg[j+1:k+1]`` that
    turn out empty, are skipped rather than reduced.
    """
    data = as_num_array(arg)
    idx = arg_sel_step_to_idx(data, sel, step)
    lows = []
    highs = []
    for start, stop in izip(idx[:-step], idx[step:]):
        if start == stop:
            continue
        window = data[start + 1:stop + 1]
        if not len(window):
            continue
        lows.append(window.min())
        highs.append(window.max())
    return as_num_array(highs) - lows
def naive1(arg, sel=None, step=1):
    """Return ``max - min`` of each non-empty slice between indices *step* apart.

    The index sequence comes from ``arg_sel_step_to_idx(arg, sel, step)``.
    Pairs whose two indices are equal, and slices ``arg[j+1:k+1]`` that
    turn out empty, are skipped rather than reduced.
    """
    data = as_num_array(arg)
    idx = arg_sel_step_to_idx(data, sel, step)
    lows = []
    highs = []
    for start, stop in izip(idx[:-step], idx[step:]):
        if start == stop:
            continue
        window = data[start + 1:stop + 1]
        if not len(window):
            continue
        lows.append(window.min())
        highs.append(window.max())
    return as_num_array(highs) - lows
def naive2(arg, sel=None, step=1, func=sum):
    """Apply *func* to each slice of *arg* between indices *step* apart.

    The index sequence comes from ``arg_sel_step_to_idx(arg, sel, step)``;
    for each pair ``(j, k)`` of indices *step* positions apart, *func* is
    called on ``arg[j+1:k+1]``.  The results are returned via
    ``as_num_array``.
    """
    data = as_num_array(arg)
    idx = arg_sel_step_to_idx(data, sel, step)
    starts = idx[:-step]
    stops = idx[step:]
    reduced = [func(data[lo + 1:hi + 1]) for lo, hi in izip(starts, stops)]
    return as_num_array(reduced)
def naive1(arg, sel=None, step=1, func=sum):
    """Apply *func* to each slice of *arg* between indices *step* apart.

    The index sequence comes from ``arg_sel_step_to_idx(arg, sel, step)``;
    for each position ``i``, *func* is called on
    ``arg[idx[i]+1 : idx[i+step]+1]``.  The results are returned via
    ``as_num_array``.
    """
    data = as_num_array(arg)
    idx = arg_sel_step_to_idx(data, sel, step)
    span_count = len(idx) - step
    reduced = [
        func(data[idx[pos] + 1:idx[pos + step] + 1])
        for pos in xrange(span_count)
    ]
    return as_num_array(reduced)
def naive1(arg, sel=None, step=1, func=sum):
    """Apply *func* to each slice of *arg* between indices *step* apart.

    The index sequence comes from ``arg_sel_step_to_idx(arg, sel, step)``;
    for each position ``i``, *func* is called on
    ``arg[idx[i]+1 : idx[i+step]+1]``.  The results are returned via
    ``as_num_array``.
    """
    data = as_num_array(arg)
    idx = arg_sel_step_to_idx(data, sel, step)
    span_count = len(idx) - step
    reduced = [
        func(data[idx[pos] + 1:idx[pos + step] + 1])
        for pos in xrange(span_count)
    ]
    return as_num_array(reduced)
def naive2(arg, sel=None, step=1, func=sum):
    """Apply *func* to each slice of *arg* between indices *step* apart.

    The index sequence comes from ``arg_sel_step_to_idx(arg, sel, step)``;
    for each pair ``(j, k)`` of indices *step* positions apart, *func* is
    called on ``arg[j+1:k+1]``.  The results are returned via
    ``as_num_array``.
    """
    data = as_num_array(arg)
    idx = arg_sel_step_to_idx(data, sel, step)
    starts = idx[:-step]
    stops = idx[step:]
    reduced = [func(data[lo + 1:hi + 1]) for lo, hi in izip(starts, stops)]
    return as_num_array(reduced)
def masksel(dep, indep, cutpoints=None, **kwargs):
    """Score each cut point by splitting *dep* with a boolean mask and calling gini2.

    For every cut, *dep* is split into the elements whose *indep* is below
    the cut and the remaining elements.

    Parameters:
        dep, indep: sequences converted with ``as_num_array``.
        cutpoints: thresholds to evaluate; defaults to
            ``midpoints_integer(indep)``.
        **kwargs: accepted but not used here.

    Returns:
        A list with one ``gini2`` result per cut point.
    """
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)
    results = []
    for threshold in cutpoints:
        below = indep < threshold
        results.append(gini2(dep[below], dep[~below]))
    return results
def masksel(dep, indep, cutpoints=None, **kwargs):
    """Score each cut point by splitting *dep* with a boolean mask and calling gini2.

    For every cut, *dep* is split into the elements whose *indep* is below
    the cut and the remaining elements.

    Parameters:
        dep, indep: sequences converted with ``as_num_array``.
        cutpoints: thresholds to evaluate; defaults to
            ``midpoints_integer(indep)``.
        **kwargs: accepted but not used here.

    Returns:
        A list with one ``gini2`` result per cut point.
    """
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)
    results = []
    for threshold in cutpoints:
        below = indep < threshold
        results.append(gini2(dep[below], dep[~below]))
    return results
def idxsel(dep, indep, cutpoints=None, **kwargs):
    """Score each cut point by splitting *dep* with index arrays and calling gini2.

    For every cut, the mask ``indep < cut`` and its complement are turned
    into index arrays with ``nonzero`` and used to select the two halves
    of *dep*.

    Parameters:
        dep, indep: sequences converted with ``as_num_array``.
        cutpoints: thresholds to evaluate; defaults to
            ``midpoints_integer(indep)``.
        **kwargs: accepted but not used here.

    Returns:
        A list with one ``gini2`` result per cut point.
    """
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)
    results = []
    for threshold in cutpoints:
        below = indep < threshold
        below_idx = nonzero(below)[0]
        rest_idx = nonzero(~below)[0]
        results.append(gini2(dep[below_idx], dep[rest_idx]))
    return results
def idxsel(dep, indep, cutpoints=None, **kwargs):
    """Score each cut point by splitting *dep* with index arrays and calling gini2.

    For every cut, the mask ``indep < cut`` and its complement are turned
    into index arrays with ``nonzero`` and used to select the two halves
    of *dep*.

    Parameters:
        dep, indep: sequences converted with ``as_num_array``.
        cutpoints: thresholds to evaluate; defaults to
            ``midpoints_integer(indep)``.
        **kwargs: accepted but not used here.

    Returns:
        A list with one ``gini2`` result per cut point.
    """
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)
    results = []
    for threshold in cutpoints:
        below = indep < threshold
        below_idx = nonzero(below)[0]
        rest_idx = nonzero(~below)[0]
        results.append(gini2(dep[below_idx], dep[rest_idx]))
    return results
def smart(dep, indep, cutpoints=None, **kwargs):
    """Dispatch to ``PartitionIntegerGini.naive`` or ``.isort`` based on input size.

    Inputs with fewer than 100 elements in *dep* go to ``naive``; everything
    else goes to ``isort``.  The converted vectors, the cut points
    (defaulting to ``midpoints_integer(indep)``) and any extra keyword
    arguments are forwarded unchanged.
    """
    # not needed unless high penalty for small datasets
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)
    if len(dep) < 100:
        implementation = PartitionIntegerGini.naive
    else:
        implementation = PartitionIntegerGini.isort
    return implementation(dep, indep, cutpoints=cutpoints, **kwargs)
def naive_comp(arg, out=None):
    """Write the first differences of *arg* into *out* using a list comprehension.

    ``out[0]`` is set to 0 and ``out[i]`` to ``arg[i] - arg[i-1]`` for
    every later position.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        out: optional output array; when omitted one is allocated with
            ``arg.new()``.

    Returns:
        The output array (the caller's *out* when one was supplied).  For
        empty input it is returned untouched.
    """
    arg = as_num_array(arg)
    # Compare against None explicitly: truth-testing an array is either an
    # error or depends on its contents, so ``not out`` cannot detect
    # "no buffer supplied".
    if out is None:
        out = arg.new()
    # An empty input has no position 0 to initialise.
    if len(arg):
        out[0] = 0
        out[1:] = [arg[i] - arg[i - 1] for i in xrange(1, len(arg))]
    return out
def fast(arg, out=None):
    """Write the first differences of *arg* into *out* with one ``subtract`` call.

    ``out[0]`` is set to 0 and ``out[1:]`` receives
    ``arg[1:] - arg[:-1]``.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        out: optional output array; when omitted one is allocated with
            ``arg.new()``.

    Returns:
        The output array (the caller's *out* when one was supplied).  For
        empty input it is returned untouched.
    """
    arg = as_num_array(arg)
    # Compare against None explicitly: truth-testing an array is either an
    # error or depends on its contents, so ``not out`` cannot detect
    # "no buffer supplied".
    if out is None:
        out = arg.new()
    # An empty input has no position 0 to initialise.
    if len(arg):
        out[0] = 0
        subtract(arg[1:], arg[:-1], out[1:])
    return out
def simple1(arg1, arg2):
    """Return ``Gini()`` of the concatenated inputs minus ``Gini2()`` of the pair.

    Empty inputs are dropped first; unless both inputs are non-empty the
    result is 0.0.
    """
    whole_gini = Gini()
    split_gini = Gini2()
    parts = []
    for candidate in (arg1, arg2):
        if len(candidate):
            parts.append(as_num_array(candidate))
    if len(parts) == 2:
        return whole_gini(concatenate(parts)) - split_gini(*parts)
    return 0.0
def simple1(arg1, arg2):
    """Return ``Gini()`` of the concatenated inputs minus ``Gini2()`` of the pair.

    Empty inputs are dropped first; unless both inputs are non-empty the
    result is 0.0.
    """
    whole_gini = Gini()
    split_gini = Gini2()
    parts = []
    for candidate in (arg1, arg2):
        if len(candidate):
            parts.append(as_num_array(candidate))
    if len(parts) == 2:
        return whole_gini(concatenate(parts)) - split_gini(*parts)
    return 0.0
def naive(arg, lower=None, upper=None):
    """Bound *arg* from below and/or above using ``maximum`` and ``minimum``.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        lower: when not None, applied via ``maximum(arg, lower)``.
        upper: when not None, applied via ``minimum(arg, upper)``.

    Returns:
        The converted (and possibly bounded) array.
    """
    bounded = as_num_array(arg)
    # the lower bound is applied first, then the upper bound
    for limit, apply_limit in ((lower, maximum), (upper, minimum)):
        if limit is not None:
            bounded = apply_limit(bounded, limit)
    return bounded
def naive_loop(arg, out=None):
    """Write the first differences of *arg* into *out* with an explicit loop.

    ``out[0]`` is set to 0 and ``out[i]`` to ``arg[i] - arg[i-1]`` for
    every later position.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        out: optional output array; when omitted one is allocated with
            ``arg.new()``.

    Returns:
        The output array (the caller's *out* when one was supplied).  For
        empty input it is returned untouched.
    """
    arg = as_num_array(arg)
    # Compare against None explicitly: truth-testing an array is either an
    # error or depends on its contents, so ``not out`` cannot detect
    # "no buffer supplied".
    if out is None:
        out = arg.new()
    # An empty input has no position 0 to initialise.
    if len(arg):
        out[0] = 0
    for i in xrange(1, len(arg)):
        out[i] = arg[i] - arg[i - 1]
    return out
def iterloop(arg, reset_value=0.0, out=None):
    """Accumulate *arg* into *out*, never letting the running total drop below *reset_value*.

    Starting from 0.0, each output is
    ``max(reset_value, previous_output + value)``.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        reset_value: floor applied to the running total at every step.
        out: optional output array; when omitted one is allocated with
            ``arg.new()``.

    Returns:
        The output array (the caller's *out* when one was supplied).
    """
    arg = as_num_array(arg)
    # Compare against None explicitly: truth-testing an array is either an
    # error or depends on its contents, so ``not out`` cannot detect
    # "no buffer supplied".
    if out is None:
        out = arg.new()
    last = 0.0
    for i, value in enumerate(arg):
        # Carry the running total forward; without this assignment every
        # output would merely be max(reset_value, value).
        last = max(reset_value, last + value)
        out[i] = last
    return out
def _prep_testdata(self, *args, **kwargs):
    """Replace each positional argument by the array of its value counts.

    For every argument the occurrences of each distinct value are counted
    in a dictionary, and the dictionary's values are converted with
    ``as_num_array``.  Keyword arguments are accepted but not used.

    Returns:
        A list with one count array per positional argument.
    """
    prepared = []
    for sample in args:
        tally = {}
        for item in sample:
            tally[item] = tally.get(item, 0) + 1
        prepared.append(as_num_array(tally.values()))
    return prepared
def groupby2(arg):
    """Return 1.0 minus the sum of squared relative frequencies of *arg*'s values.

    The converted input is sorted with the builtin ``sorted`` so that
    ``groupby`` sees equal values as a single run; each run's share of
    ``len(arg)`` is squared and subtracted from 1.0 in turn.
    """
    values = as_num_array(arg)
    total = float(len(values))
    remainder = 1.0
    for _value, run in groupby(sorted(values)):
        share = len(list(run)) / total
        remainder -= share * share
    return remainder
def groupby2(arg):
    """Return 1.0 minus the sum of squared relative frequencies of *arg*'s values.

    The converted input is sorted with the builtin ``sorted`` so that
    ``groupby`` sees equal values as a single run; each run's share of
    ``len(arg)`` is squared and subtracted from 1.0 in turn.
    """
    values = as_num_array(arg)
    total = float(len(values))
    remainder = 1.0
    for _value, run in groupby(sorted(values)):
        share = len(list(run)) / total
        remainder -= share * share
    return remainder
def _prep_testdata(self, *args, **kwargs):
    """Replace each positional argument by the array of its value counts.

    For every argument the occurrences of each distinct value are counted
    in a dictionary, and the dictionary's values are converted with
    ``as_num_array``.  Keyword arguments are accepted but not used.

    Returns:
        A list with one count array per positional argument.
    """
    prepared = []
    for sample in args:
        tally = {}
        for item in sample:
            tally[item] = tally.get(item, 0) + 1
        prepared.append(as_num_array(tally.values()))
    return prepared
def fast(arg, boundaries=[0, 100, 1000], values=None):
    """Bin *arg* against *boundaries* with ``searchsorted``.

    Parameters:
        arg: value(s) to bin.
        boundaries: non-empty sequence of bin edges handed to
            ``searchsorted``.  The default list is only read, never mutated.
        values: optional sequence with ``len(boundaries) + 1`` entries; when
            given, each bin index is replaced by the entry at that index.

    Returns:
        The bin indices when *values* is None, otherwise
        ``as_num_array(values).take(indices)``.

    Raises:
        AssertionError: if *boundaries* is empty or *values* has the wrong
            length.
    """
    assert len(boundaries), "at least one boundary is required"
    if values is None:
        return searchsorted(boundaries, arg)
    assert len(boundaries)+1 == len(values), "len(values) must be len(boundaries)+1, (%s,%s)" % (len(values),len(boundaries))
    bins = searchsorted(boundaries, arg)
    return as_num_array(values).take(bins)
def presort(dep, indep, cutpoints=None, dep_sorted=False):
    """Score each cut point with ``gini2_presorted`` on data ordered by *dep*.

    Parameters:
        dep, indep: sequences converted with ``as_num_array``.
        cutpoints: thresholds to evaluate; defaults to
            ``midpoints_integer(indep)`` (computed after any reordering).
        dep_sorted: when false, both vectors are first reordered by
            ``argsort(dep)``; when true they are used as given.

    Returns:
        A list with one ``gini2_presorted`` result per cut point.
    """
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if not dep_sorted:
        order = argsort(dep)
        dep = take(dep, order)
        indep = take(indep, order)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)
    results = []
    for threshold in cutpoints:
        below = indep < threshold
        below_idx = nonzero(below)[0]
        rest_idx = nonzero(~below)[0]
        results.append(gini2_presorted(dep[below_idx], dep[rest_idx]))
    return results
def presort(dep, indep, cutpoints=None, dep_sorted=False):
    """Score each cut point with ``gini2_presorted`` on data ordered by *dep*.

    Parameters:
        dep, indep: sequences converted with ``as_num_array``.
        cutpoints: thresholds to evaluate; defaults to
            ``midpoints_integer(indep)`` (computed after any reordering).
        dep_sorted: when false, both vectors are first reordered by
            ``argsort(dep)``; when true they are used as given.

    Returns:
        A list with one ``gini2_presorted`` result per cut point.
    """
    dep = as_num_array(dep)
    indep = as_num_array(indep)
    if not dep_sorted:
        order = argsort(dep)
        dep = take(dep, order)
        indep = take(indep, order)
    if cutpoints is None:
        cutpoints = midpoints_integer(indep)
    results = []
    for threshold in cutpoints:
        below = indep < threshold
        below_idx = nonzero(below)[0]
        rest_idx = nonzero(~below)[0]
        results.append(gini2_presorted(dep[below_idx], dep[rest_idx]))
    return results
def naive_loop(arg, first=False):
    """Flag the first or last element of every run of equal adjacent values.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        first: when true, flag the first element of each run (position 0 is
            always flagged); otherwise flag the last element of each run
            (the final position is always flagged).

    Returns:
        A 'Bool' array with one flag per element of *arg*; empty input
        gives an empty result.
    """
    arg = as_num_array(arg)
    # changed[i] tells whether arg[i+1] differs from arg[i]
    changed = []
    for i in xrange(1, len(arg)):
        if arg[i] != arg[i - 1]:
            changed.append(1)
        else:
            changed.append(0)
    if not len(arg):
        # No elements means no runs: do not emit the always-set end flag,
        # so the result has the same length as the input.
        out = []
    elif first:
        out = [1] + changed
    else:
        out = changed + [1]
    return as_num_array(out, type='Bool')
def groupby1(arg):
    """Return 1.0 minus the sum of squared relative frequencies of *arg*'s values.

    A list of run lengths is built from ``groupby`` over the sorted input;
    each length is divided by ``len(arg)``, squared, and subtracted from 1.0.
    """
    values = as_num_array(arg)
    run_sizes = [len(list(run)) for _value, run in groupby(sorted(values))]
    total = float(len(values))
    remainder = 1.0
    for size in run_sizes:
        share = size / total
        remainder -= share * share
    return remainder
def gsl(arg, mean=0.0, variance=0.0, out=None):
    """Evaluate ``gaussian_pdf`` on *arg* shifted by *mean* and store it in *out*.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        mean: value subtracted from *arg* before the density is evaluated.
        variance: forwarded unchanged as the second argument of
            ``gaussian_pdf``.
        out: optional output array; when omitted one is allocated with
            ``arg.new()``.

    Returns:
        The output array (the caller's *out* when one was supplied).
    """
    arg = as_num_array(arg)
    # Compare against None explicitly: truth-testing an array is either an
    # error or depends on its contents, so ``not out`` cannot detect
    # "no buffer supplied".
    if out is None:
        out = arg.new()
    # Always centre on the mean.  Previously the subtraction only ran when
    # mean == 0.0 (where it has no effect), so a non-zero mean was ignored.
    out[:] = gaussian_pdf(arg - mean, variance)
    return out
def gsl(arg, mean=0.0, variance=0.0, out=None):
    """Evaluate ``gaussian_pdf`` on *arg* shifted by *mean* and store it in *out*.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        mean: value subtracted from *arg* before the density is evaluated.
        variance: forwarded unchanged as the second argument of
            ``gaussian_pdf``.
        out: optional output array; when omitted one is allocated with
            ``arg.new()``.

    Returns:
        The output array (the caller's *out* when one was supplied).
    """
    arg = as_num_array(arg)
    # Compare against None explicitly: truth-testing an array is either an
    # error or depends on its contents, so ``not out`` cannot detect
    # "no buffer supplied".
    if out is None:
        out = arg.new()
    # Always centre on the mean.  Previously the subtraction only ran when
    # mean == 0.0 (where it has no effect), so a non-zero mean was ignored.
    out[:] = gaussian_pdf(arg - mean, variance)
    return out
def groupby1(arg):
    """Return 1.0 minus the sum of squared relative frequencies of *arg*'s values.

    A list of run lengths is built from ``groupby`` over the sorted input;
    each length is divided by ``len(arg)``, squared, and subtracted from 1.0.
    """
    values = as_num_array(arg)
    run_sizes = [len(list(run)) for _value, run in groupby(sorted(values))]
    total = float(len(values))
    remainder = 1.0
    for size in run_sizes:
        share = size / total
        remainder -= share * share
    return remainder
def naive_loop(arg, first=False):
    """Flag the first or last element of every run of equal adjacent values.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        first: when true, flag the first element of each run (position 0 is
            always flagged); otherwise flag the last element of each run
            (the final position is always flagged).

    Returns:
        A 'Bool' array with one flag per element of *arg*; empty input
        gives an empty result.
    """
    arg = as_num_array(arg)
    # changed[i] tells whether arg[i+1] differs from arg[i]
    changed = []
    for i in xrange(1, len(arg)):
        if arg[i] != arg[i - 1]:
            changed.append(1)
        else:
            changed.append(0)
    if not len(arg):
        # No elements means no runs: do not emit the always-set end flag,
        # so the result has the same length as the input.
        out = []
    elif first:
        out = [1] + changed
    else:
        out = changed + [1]
    return as_num_array(out, type='Bool')
def naive_loop(arg, out=None):
    """Copy *arg* into *out*, replacing each zero by the most recent non-zero value.

    Positions before the first non-zero element receive 0.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        out: optional output array; when omitted one is allocated with
            ``arg.new()``.

    Returns:
        The output array (the caller's *out* when one was supplied).
    """
    arg = as_num_array(arg)
    # Compare against None explicitly: truth-testing an array is either an
    # error or depends on its contents, so ``not out`` cannot detect
    # "no buffer supplied".
    if out is None:
        out = arg.new()
    last = 0
    for i in xrange(len(arg)):
        if arg[i] != 0:
            last = arg[i]
        out[i] = last
    return out
def fast(arg, first=False):
    """Flag the first or last element of every run of equal adjacent values.

    Starts from an all-ones 'Bool' array and subtracts, in place, the mask
    of positions whose neighbour holds the same value.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        first: when true the mask is applied to positions 1.. (clearing
            elements equal to their predecessor); otherwise to positions
            ..-1 (clearing elements equal to their successor).

    Returns:
        A 'Bool' array with one flag per element of *arg*.
    """
    data = as_num_array(arg)
    flags = ones(len(data), type='Bool')
    # true where two neighbouring elements are equal
    repeats = data[1:] == data[:-1]
    if first:
        flags[1:] -= repeats
    else:
        flags[:-1] -= repeats
    return flags
def clip(arg, lower=None, upper=None):
    """Bound *arg*, using a single ``clip`` call when both limits are given.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        lower: optional lower limit.
        upper: optional upper limit.

    Returns:
        ``clip(arg, lower, upper)`` when both limits are set, otherwise the
        result of ``maximum`` / ``minimum`` for whichever limit is set, or
        the converted array when neither is.

    NOTE(review): the inner ``clip`` call resolves to whatever ``clip`` is
    bound to in the enclosing module scope; if this function were itself
    that module-level name, the call would recurse -- confirm.
    """
    data = as_num_array(arg)
    has_lower = lower is not None
    has_upper = upper is not None
    if has_lower and has_upper:
        return clip(data, lower, upper)
    if has_lower:
        return maximum(data, lower)
    if has_upper:
        return minimum(data, upper)
    return data
def fast(arg, first=False):
    """Flag the first or last element of every run of equal adjacent values.

    Starts from an all-ones 'Bool' array and subtracts, in place, the mask
    of positions whose neighbour holds the same value.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        first: when true the mask is applied to positions 1.. (clearing
            elements equal to their predecessor); otherwise to positions
            ..-1 (clearing elements equal to their successor).

    Returns:
        A 'Bool' array with one flag per element of *arg*.
    """
    data = as_num_array(arg)
    flags = ones(len(data), type='Bool')
    # true where two neighbouring elements are equal
    repeats = data[1:] == data[:-1]
    if first:
        flags[1:] -= repeats
    else:
        flags[:-1] -= repeats
    return flags
def _prep_testdata(self, *args, **kwargs):
    """Convert the inputs to arrays and make sure they are ordered by the first one.

    A single positional argument is duplicated (copied) so there are two
    vectors.  Unless ``kwargs['dep_sorted']`` is already truthy, all
    vectors are reordered by ``argsort`` of the first and the flag is set
    to True.

    Returns:
        A ``(vectors, kwargs)`` tuple.
    """
    # benchmark for inputs that are already vectors
    # simplification for tests: dep == indep
    vectors = [as_num_array(item) for item in args]
    if len(vectors) == 1:
        vectors.append(vectors[0].copy())
    if kwargs.get('dep_sorted'):
        return (vectors, kwargs)
    order = argsort(vectors[0])
    vectors = [take(vec, order) for vec in vectors]
    kwargs['dep_sorted'] = True
    return (vectors, kwargs)
def loop3(arg):
    """Return 1.0 minus the sum of squared relative frequencies of *arg*'s values.

    Occurrences are tallied in a dictionary (no sorting involved); each
    count is divided by ``len(arg)``, squared, and subtracted from 1.0.
    """
    values = as_num_array(arg)
    total = float(len(values))
    tally = {}
    for item in values:
        tally[item] = tally.get(item, 0) + 1
    remainder = 1.0
    for count in tally.itervalues():
        share = (count * 1.0) / total
        remainder -= share * share
    return remainder
def naive_iter(arg, out=None):
    """Copy *arg* into *out*, replacing each zero by the most recent non-zero value.

    Positions before the first non-zero element receive 0.

    Parameters:
        arg: sequence converted with ``as_num_array``.
        out: optional output array; when omitted one is allocated with
            ``arg.new()``.

    Returns:
        The output array (the caller's *out* when one was supplied).
    """
    arg = as_num_array(arg)
    # Compare against None explicitly: truth-testing an array is either an
    # error or depends on its contents, so ``not out`` cannot detect
    # "no buffer supplied".
    if out is None:
        out = arg.new()
    last = 0
    for i, value in enumerate(arg):
        if value != 0:
            last = value
        out[i] = last
    return out