class HistoTuple(EquivUnary):
    '''Histogram of the input, returned as a sorted list of (value, count) tuples.

    Two interchangeable implementations are provided: one counts runs of
    equal values in the sorted data, the other tallies values in a dict.
    '''
    name = 'histo_tuple'
    ranking = ('iter_groupby', 'dict1')
    tests = (
        Test([1, 2, 3, 4, 1, 2, 3, 1, 1, 1, 9]) == [(1, 5), (2, 2), (3, 2), (4, 1), (9, 1)],
        Test([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) == [(1, 11)],
    )

    @staticmethod
    def iter_groupby(arg):
        '''Sort the data, then measure each run of equal values with groupby.'''
        ordered = sorted(as_any_array(arg))
        pairs = []
        for key, run in groupby(ordered):
            pairs.append((key, len(list(run))))
        pairs.sort()
        return pairs

    @staticmethod
    def dict1(arg, out=None):
        '''Tally occurrences in a dict and return its items sorted.

        The `out` parameter is accepted but never read.
        '''
        counts = {}
        for item in as_any_array(arg):
            counts[item] = counts.get(item, 0) + 1
        return sorted(counts.items())
class DictMapDefault(EquivUnary):
    '''apply dictionary mapping to each element in array, using default for unmatched

    Every element found as a key in `mapping` is replaced by the mapped
    value; every other element is replaced by `default` (0 unless given).

    >>> func = DictMapDefault()
    >>> mapping = {1:100,2:200,3:300}
    >>> assert func([1,3,6,7,7,9],mapping=mapping).tolist() == [100,300,0,0,0,0]
    '''
    name = 'dict_map_default'
    ranking = ('naive_comp', 'naive_loop',)
    tests = (
        Test([1,3,6,7,7,9],mapping={1:100,2:200,3:300}) == [100,300,0,0,0,0],
        Test([1,3,6,7,'a',9],mapping={'1':100,'a':'aaa','3':300}) == ['100','300','0','0','aaa','0'],
    )

    @staticmethod
    def naive_loop(arg,mapping=None,default=0):
        '''Explicit-loop implementation.

        arg     -- input sequence (converted with as_any_array)
        mapping -- dict of replacements; None means "no replacements"
        default -- value used for elements that are not keys of mapping
        '''
        # None instead of a shared mutable {} default
        if mapping is None:
            mapping = {}
        arg = as_any_array(arg)
        out = []
        for value in arg:
            out.append(mapping.get(value,default))
        return as_any_array(out)

    @staticmethod
    def naive_comp(arg,mapping=None,default=0):
        '''List-comprehension implementation; same contract as naive_loop.'''
        # None instead of a shared mutable {} default
        if mapping is None:
            mapping = {}
        arg = as_any_array(arg)
        out = [mapping.get(value,default) for value in arg]
        return as_any_array(out)
class WindowRange(EquivUnary):
    """Get value range within sliding window on single data vector

    For each pair of window boundaries (j, k) taken `step` positions apart
    from the index vector produced by arg_sel_step_to_idx, the result is
    max - min of arg[j+1:k+1].
    """
    name = 'window_range'
    # Only naive1 is ranked.  An earlier assignment that also listed 'naive2'
    # was immediately overridden, so it is kept here only as a comment:
    #ranking = ('naive1', 'naive2')
    ranking = ('naive1', )
    bench_sizes = (1, 2, 3, 4, 5, 10, 20, 50, 100, 200, 300, 500, 1000, 5000)
    tests = (
        Test([0, 1, 2, 3]) == [0, 0, 0],
        Test([0, 1, 2, 3], step=2) == [0, 1, 1],
        Test(arange(20), sel=[0, 3, 5, 9, 20]) == [2, 1, 3, 9],
    )

    @staticmethod
    def naive1(arg, sel=None, step=1):
        """Range per window, skipping windows that select no data.

        arg  -- data vector (converted with as_num_array)
        sel  -- window selection, passed through to arg_sel_step_to_idx
        step -- distance, in index-vector positions, between the two
                boundaries of a window
        """
        arg = as_num_array(arg)
        idx = arg_sel_step_to_idx(arg, sel, step)
        jj = idx[:-step]
        kk = idx[step:]
        # windows whose two boundaries coincide contribute nothing
        segments = [arg[j + 1:k + 1] for j, k in izip(jj, kk) if j != k]
        omin = [seg.min() for seg in segments if len(seg)]
        omax = [seg.max() for seg in segments if len(seg)]
        return as_num_array(omax) - omin

    # The decorator was missing here although every other implementation in
    # this file is a static method; without it an instance call would bind
    # the instance to `arg`.
    @staticmethod
    def naive2(arg, sel=None, step=1):
        """Range per window, without filtering out empty windows.

        Same parameters as naive1.  Unlike naive1, min()/max() are applied
        to every window, including any that turn out to be empty.
        """
        arg = as_num_array(arg)
        idx = arg_sel_step_to_idx(arg, sel, step)
        jj = idx[:-step]
        kk = idx[step:]
        omin = [arg[j + 1:k + 1].min() for j, k in izip(jj, kk)]
        omax = [arg[j + 1:k + 1].max() for j, k in izip(jj, kk)]
        return as_num_array(omax) - omin
class RoundAll(EquivUnary):
    '''Round every element to `ndigits` decimal places, like python's round()
    (there is no equivalent ufunc).

    >>> func = RoundAll()
    >>> assert func([12.3456,30,456.12,0],ndigits=2).tolist() == [12.35,30.,456.12,0.]
    '''
    name = 'round_all'
    ranking = ('roundint','naive')
    tests = (
        Test([12.3456,30,456.12,0],ndigits=-1) == [10,30,460,0],
        Test([12.3456,30,456.12,0],ndigits=0) == [12,30,456,0],
        Test([12.3456,30,456.12,0],ndigits=1) == [12.3,30,456.1,0],
        Test([12.3456,30,456.12,0],ndigits=2) == [12.35,30,456.12,0],
        Test([-12.3456,-30,-456.12,0],ndigits=2) == [-12.35,-30,-456.12,0],
    )

    @staticmethod
    def roundint(arg,ndigits=0):
        '''Vectorized rounding: scale, nudge by half a unit away from zero,
        convert to integer, then scale back.'''
        values = as_num_array(arg)
        scale = 10**ndigits
        # +0.5 for non-negative entries, -0.5 for negative ones
        half = where(values>=0,0.5,-0.5)
        shifted = (values*scale)+half
        return shifted.astype('Int') / float(scale)

    @staticmethod
    def naive(arg,ndigits=0):
        '''Call the builtin round() on each element in turn.'''
        values = as_num_array(arg)
        rounded = [round(item,ndigits) for item in values]
        return as_num_array(rounded)
class MidpointsFloat(EquivUnary):
    """Midpoints between consecutive unique values of a vector.

    Typically used to enumerate candidate cutpoints when building a tree.

    Note: the result has len(unique_values)-1 entries.
    Note: works on any numeric vector type; the midpoints are floats.
    """
    ranking = ('vector','naive')
    tests = (
        Test([1.0]) == [],
        Test([2.0]) == [],
        Test([1,3,5]) == [2.0,4.0],
        Test([1,3,3]) == [2.0],
        Test([1.2,1.3,1.5]) == [1.25,1.4],
    )

    @staticmethod
    def naive(arg):
        """Pure-python version: walk adjacent pairs of the sorted unique values."""
        ordered = sorted(set(arg))
        mids = []
        for pos in range(1, len(ordered)):
            lower = ordered[pos-1]
            upper = ordered[pos]
            mids.append((lower+upper)/2.0)
        return mids

    @staticmethod
    def vector(arg):
        """Array version: average the sorted unique values with their successors."""
        ordered = as_num_array(sorted(set(arg)))
        if len(ordered) > 1:
            return (ordered[1:]+ordered[:-1])/2.0
        # fewer than two unique values: no midpoints
        return []
class Gini2(EquivBinary):
    """GINI on a pair of vectors

    The result is the sum of each non-empty vector's gini score weighted by
    that vector's share of the combined length.
    """
    itypes = ('i', 'i')
    name = 'gini2'
    ranking = ('simple2', 'simple1')
    tests = (
        Test([1, 1, 1], [1, 1, 1]) == 0.0,
        Test([1, 1, 1], [2, 2, 2]) == 0.0,
        Test([1, 1, 1], [1, 1, 2])**0.222222222222,
        Test([1, 2, 3], [1, 2, 3])**0.666666666667,
        Test([1, 2, 3], [1, 2, 3, 4, 5])**0.75,
    )

    @staticmethod
    def simple2(arg1, arg2):
        """Weighted gini using the module-level `gini` callable."""
        parts = []
        for vec in (arg1, arg2):
            # empty vectors carry no weight and are dropped
            if len(vec):
                parts.append(as_num_array(vec))
        total = float(sum(len(vec) for vec in parts))
        return sum((gini(vec) * len(vec) / total) for vec in parts)

    @staticmethod
    def simple1(arg1, arg2):
        """Weighted gini using a freshly constructed Gini() instance."""
        gini = Gini()
        parts = []
        for vec in (arg1, arg2):
            # empty vectors carry no weight and are dropped
            if len(vec):
                parts.append(as_num_array(vec))
        total = float(sum(len(vec) for vec in parts))
        return sum((gini(vec) * len(vec) / total) for vec in parts)
class DeltaPrev(EquivUnary):
    '''difference from previous value

    The first output element is always 0; every later element is
    arg[i] - arg[i-1].

    All implementations accept an optional preallocated `out` array; when
    it is None a new array is obtained from arg.new().

    >>> func = DeltaPrev()
    >>> assert func([1,3,6,7,7,9]).tolist() == [0,2,3,1,0,2]
    '''
    name = 'delta_prev'
    ranking = ('smart','fast', 'naive_comp', 'naive_loop',)
    tests = (
        Test([1,3,6,7,7,9]) == [0,2,3,1,0,2],
        Test([0,3,0,7,0,0]) == [0,3,-3,7,-7,0],
        Test([0]) == [0],
    )

    @staticmethod
    def naive_loop(arg,out=None):
        '''Element-by-element loop.'''
        arg = as_num_array(arg)
        # test identity, not truthiness: a caller-supplied array must be
        # used even if its truth value is False or undefined
        if out is None:
            out = arg.new()
        out[0] = 0
        for i in xrange(1,len(arg)):
            out[i] = arg[i] - arg[i-1]
        return out

    @staticmethod
    def naive_comp(arg,out=None):
        '''List comprehension assigned into the output slice.'''
        arg = as_num_array(arg)
        if out is None:
            out = arg.new()
        out[0] = 0
        out[1:] = [arg[i]-arg[i-1] for i in xrange(1,len(arg))]
        return out

    @staticmethod
    def fast(arg,out=None):
        '''Vectorized: subtract the shifted slices directly into out[1:].'''
        arg = as_num_array(arg)
        if out is None:
            out = arg.new()
        out[0] = 0
        subtract(arg[1:],arg[:-1],out[1:])
        return out

    @classmethod
    def smart(cls,arg,out=None):
        '''Use the plain loop for inputs shorter than 10, else the vectorized version.'''
        if len(arg) < 10:
            return cls.naive_loop(arg,out)
        return cls.fast(arg,out)

    @staticmethod
    def _check_result(out,arg,**kwargs):
        '''Assert that `out` is the delta-from-previous of `arg`; returns True.'''
        assert out[0] == 0
        for o,a1,a2 in izip(out[1:],arg[1:],arg[:-1]):
            assert o == a1-a2
        return True
def run():
    """Load the adapter catalog and run the test once per adapter key.

    The keys are exercised in this order: 'amqp-0-9-1', then 'amqp'.
    """
    loader = Loader()
    loader.load()
    for key in ('amqp-0-9-1', 'amqp'):
        adapter = loader.catalog[key]
        test = Test(URL, adapter)
        test()
def run():
    """Load the adapter catalog and run the CRUD and no-exchange tests for
    each adapter key.

    The keys are exercised in this order: 'amqp-1-0', then 'proton'.
    """
    loader = Loader()
    loader.load()
    for key in ('amqp-1-0', 'proton'):
        adapter = loader.catalog[key]
        test = Test(URL, adapter)
        test.test_crud()
        test.test_no_exchange()
def run():
    """Load the adapter catalog and run the test once per adapter key.

    The keys are exercised in this order: 'amqp-0-10', 'qpid',
    'qpid.messaging'.
    """
    loader = Loader()
    loader.load()
    for key in ('amqp-0-10', 'qpid', 'qpid.messaging'):
        adapter = loader.catalog[key]
        test = Test(URL, adapter)
        test()
class FuncMap(EquivUnary):
    '''apply function to each element in array

    When `func` is not given (or is falsy) the identity function is used.
    Note that a bare dict.get returns None for keys that are missing, so
    the example below maps every input value explicitly.

    >>> func = FuncMap()
    >>> mapping = {1:100,2:200,3:300,6:6,7:7,9:9}
    >>> assert func([1,3,6,7,7,9],func=mapping.get).tolist() == [100,300,6,7,7,9]
    '''
    name = 'func_map'
    ranking = ('naive_comp', 'naive_loop',)
    tests = (
        Test([1,3,6,7,7,9],func={1:100,2:200,3:300,6:6,7:7,9:9}.get) == [100,300,6,7,7,9],
    )

    @staticmethod
    def _default_func(arg):
        '''Identity function used when the caller supplies no func.'''
        return arg

    @staticmethod
    def naive_loop(arg,func=None):
        '''Explicit-loop implementation.

        arg  -- input sequence (converted with as_any_array)
        func -- one-argument callable applied to each element
        '''
        func = func or FuncMap._default_func
        arg = as_any_array(arg)
        out = []
        for value in arg:
            out.append(func(value))
        return as_any_array(out)

    @staticmethod
    def naive_comp(arg,func=None):
        '''List-comprehension implementation; same contract as naive_loop.'''
        func = func or FuncMap._default_func
        arg = as_any_array(arg)
        out = [func(value) for value in arg]
        return as_any_array(out)
class GaussianPdf(EquivUnary):
    '''Gaussian probability distribution function

    >>> func = GaussianPdf().gsl
    >>> assert allclose(func([1.2,0.1,0.5],variance=1.0),[0.19418605,0.39695255,0.35206533])
    '''
    name = 'gaussian_pdf'
    ranking = ('gsl',)
    tests = (
        Test([1.2,0.1,0.5],variance=1.0) ** [0.19418605,0.39695255,0.35206533],
    )

    @staticmethod
    def gsl(arg,mean=0.0,variance=0.0,out=None):
        '''Evaluate the density at each element of arg.

        arg      -- input values (converted with as_num_array)
        mean     -- centre of the distribution; subtracted from arg before
                    the values are handed to gaussian_pdf
        variance -- passed unchanged as the second argument of gaussian_pdf
                    NOTE(review): GSL's gaussian pdf takes sigma (standard
                    deviation) there -- confirm which one callers intend
        out      -- optional preallocated result array; a new one is taken
                    from arg.new() when None
        '''
        arg = as_num_array(arg)
        # test identity, not truthiness, so a supplied array is always used
        if out is None:
            out = arg.new()
        # The previous code only subtracted the mean when it was 0.0 and
        # ignored it otherwise; shifting unconditionally handles both cases.
        out[:] = gaussian_pdf(arg-mean,variance)
        return out
class UniqueMask(EquivUnary):
    '''Given sorted data, return a mask that keeps one element per run of
    equal values.

    Typically used to reduce data with identical timestamps.
    By default the last value of each run is kept; pass first=True to keep
    the first value instead.
    '''
    name = 'unique_mask'
    ranking = (
        'fast',
        'naive_loop',
    )
    tests = (
        Test([0]) == [1],
        Test([0], first=True) == [1],
        Test([1, 2, 3, 7, 8, 9]) == [1, 1, 1, 1, 1, 1],
        Test([1, 2, 3, 7, 8, 9], first=True) == [1, 1, 1, 1, 1, 1],
        Test([1, 3, 3, 3, 4, 5]) == [1, 0, 0, 1, 1, 1],
        Test([1, 3, 3, 3, 4, 5], first=True) == [1, 1, 0, 0, 1, 1],
        Test([1, 1, 3, 3, 5, 5], first=False) == [0, 1, 0, 1, 0, 1],
        Test([1, 1, 3, 3, 5, 5], first=True) == [1, 0, 1, 0, 1, 0],
    )

    @staticmethod
    def naive_loop(arg, first=False):
        '''Build the mask one neighbour comparison at a time.'''
        data = as_num_array(arg)
        flags = []
        if first:
            # the leading element always starts a run
            flags.append(1)
            for pos in xrange(1, len(data)):
                if data[pos - 1] != data[pos]:
                    flags.append(1)
                else:
                    flags.append(0)
        else:
            for pos in xrange(len(data) - 1):
                if data[pos + 1] != data[pos]:
                    flags.append(1)
                else:
                    flags.append(0)
            # the trailing element always ends a run
            flags.append(1)
        return as_num_array(flags, type='Bool')

    @staticmethod
    def fast(arg, first=False):
        '''Vectorized: start from an all-true mask and clear the duplicates.'''
        data = as_num_array(arg)
        keep = ones(len(data), type='Bool')
        # true wherever an element equals its immediate neighbour
        same_as_neighbour = data[1:] == data[:-1]
        # in-place subtraction on the Bool mask is relied upon to clear the
        # flagged positions (presumably numarray semantics -- TODO confirm)
        if first:
            keep[1:] -= same_as_neighbour
        else:
            keep[:-1] -= same_as_neighbour
        return keep
class MidpointsInteger(EquivUnary):
    """Integer midpoints between consecutive unique values of a vector.

    Typically used to enumerate candidate cutpoints when building a tree.

    Note: the result has len(unique_values)-1 entries.
    Note: intended for integer vectors only; the midpoints are integers,
          rounded up when the two neighbours have an odd sum.
    """
    ranking = ('vector','naive')
    tests = (
        Test([1]) == [],
        Test([2]) == [],
        Test([1,1,2]) == [2],
        Test([2,2,3]) == [3],
        Test([1,3,5]) == [2,4],
        Test([-5,-1,0,1,8,2,2]) == [-3,0,1,2,5],
    )

    @staticmethod
    def naive(arg):
        """Pure-python version: walk adjacent pairs of the sorted unique values."""
        ordered = sorted(set(arg))
        mids = []
        for pos in range(1, len(ordered)):
            lower = ordered[pos-1]
            upper = ordered[pos]
            # the +1 makes the floor division round up
            mids.append(((lower+upper)+1)//2)
        return mids

    @staticmethod
    def vector(arg):
        """Array version of the same computation."""
        ordered = as_num_array(sorted(set(arg)))
        if len(ordered) > 1:
            # assuming integer values and a '<' comparison at the cutpoint,
            # the +1 causes round-up
            return (ordered[1:]+ordered[:-1]+1)//2
        # fewer than two unique values: no midpoints
        return []
class RangeCap(EquivUnary):
    '''Limit values to the interval [lower, upper].

    Either bound may be None, in which case that side is left uncapped.

    >>> func = RangeCap()
    >>> assert func([1,3,-9,-3,6,7,9],lower=-5,upper=6).tolist() == [1,3,-5,-3,6,6,6]
    '''
    name = 'range_cap'
    ranking = ('naive', 'clip')
    tests = (
        Test([1, 3, -9, -3, 6, 7, 9, 0], lower=None, upper=None) == [1, 3, -9, -3, 6, 7, 9, 0],
        Test([1, 3, -9, -3, 6, 7, 9, 0], lower=5, upper=None) == [5, 5, 5, 5, 6, 7, 9, 5],
        Test([1, 3, -9, -3, 6, 7, 9, 0], lower=-5, upper=6) == [1, 3, -5, -3, 6, 6, 6, 0],
    )

    @staticmethod
    def clip(arg, lower=None, upper=None):
        '''Use the module-level clip() when both bounds are given, otherwise
        apply whichever single bound is present.'''
        data = as_num_array(arg)
        has_lower = lower is not None
        has_upper = upper is not None
        if has_lower and has_upper:
            # inside a static method this name resolves to the module-level
            # clip function, not to this method
            return clip(data, lower, upper)
        if has_lower:
            return maximum(data, lower)
        if has_upper:
            return minimum(data, upper)
        return data

    @staticmethod
    def naive(arg, lower=None, upper=None):
        '''Apply the lower bound with maximum(), then the upper with minimum().'''
        data = as_num_array(arg)
        for bound, limiter in ((lower, maximum), (upper, minimum)):
            if bound is not None:
                data = limiter(data, bound)
        return data
class Discretize(EquivUnary):
    """Map each value to a segment of a sorted boundary table.

    Without `values` the result is the segment index: the position of the
    first boundary that is >= the value, or len(boundaries) when the value
    exceeds every boundary.  With `values` (one entry more than there are
    boundaries) the result is the entry of `values` at that index.

    >>> func = Discretize()
    >>> b = as_num_array([1,10,50])
    >>> v = as_num_array([0,5,35,100])
    >>> data = as_num_array([100,75,50,25,2,1,0])
    >>> print func(data,boundaries=b)
    [3 3 2 2 1 0 0]
    >>> print func(data,boundaries=b,values=v)
    [100 100 35 35 5 0 0]
    """
    ranking = ('fast','loop')
    tests = (
        Test([0],boundaries=[0]) == [0],
        Test([1,2,3,4,5],boundaries=[3]) == [0,0,0,1,1],
        Test([1,2,1.3,1.4,1.33],boundaries=[1.33]) == [0,1,0,1,0],
        Test([100,75,50,25,2,1,0],boundaries=[1,10,50]) == [3,3,2,2,1,0,0],
        Test([100,75,50,25,2,1,0],boundaries=[1,10,50],values=[0,5,35,100]) == [100,100,35,35,5,0,0],
        Test([100,-25,50,75,2,0,1],boundaries=[1,10,50],values=[0,5,35,100])== [100,0,35,100,5,0,0],
        Test([1,2,3,4,5],boundaries=[3,2,1]) == [0,0,0,3,3], # nonsense
    )

    @staticmethod
    def fast(arg,boundaries=[0,100,1000],values=None):
        """Vectorized lookup via searchsorted."""
        want_values = values is not None
        assert len(boundaries), "at least one boundary is required"
        if want_values:
            assert len(boundaries)+1 == len(values), "len(values) must be len(boundaries)+1, (%s,%s)" % (len(values),len(boundaries))
        bins = searchsorted(boundaries,arg)
        if want_values:
            return as_num_array(values).take(bins)
        return bins

    @staticmethod
    def loop(arg,boundaries=[0,100,1000],values=None):
        """Pure-python lookup: scan the boundaries for each value."""
        want_values = values is not None
        assert len(boundaries), "at least one boundary is required"
        if want_values:
            assert len(boundaries)+1 == len(values), "len(values) must be len(boundaries)+1, (%s,%s)" % (len(values),len(boundaries))
        bins = []
        for item in arg:
            for pos,edge in enumerate(boundaries):
                if item <= edge:
                    bins.append(pos)
                    break
            else:
                # larger than every boundary: falls in the last segment
                bins.append(len(boundaries))
        if want_values:
            return [values[pos] for pos in bins]
        return bins
class UniformPdf(EquivUnary):
    '''Uniform probability distribution function

    >>> func = UniformPdf().gsl
    >>> assert allclose(func([1.2,0.1,0.5],b=1.0),[0.0,1.0,1.0])
    '''
    name = 'uniform_pdf'
    ranking = ('gsl', )
    tests = (Test([1.2, 0.1, 0.5], b=1.0)**[0.0, 1.0, 1.0], )

    @staticmethod
    def gsl(arg, a=0.0, b=0.0, out=None):
        '''Evaluate flat_pdf(arg, a, b) element-wise.

        arg  -- input values (converted with as_num_array)
        a, b -- passed unchanged to flat_pdf (presumably the lower and
                upper ends of the interval -- confirm against flat_pdf)
        out  -- optional preallocated result array; a new one is taken
                from arg.new() when None
        '''
        arg = as_num_array(arg)
        # test identity, not truthiness, so a supplied array is always used
        if out is None:
            out = arg.new()
        out[:] = flat_pdf(arg, a, b)
        return out
class ExponentialPdf(EquivUnary):
    '''Exponential probability distribution function

    >>> func = ExponentialPdf().gsl
    >>> assert allclose(func([1.2,0.1,0.5],mu=1.0),[0.30119421,0.90483742,0.60653066])
    '''
    name = 'exponential_pdf'
    ranking = ('gsl', )
    tests = (Test([1.2, 0.1, 0.5], mu=1.0)**[0.30119421, 0.90483742, 0.60653066], )

    @staticmethod
    def gsl(arg, mu=0.0, out=None):
        '''Evaluate exponential_pdf(arg, mu) element-wise.

        arg -- input values (converted with as_num_array)
        mu  -- passed unchanged to exponential_pdf
        out -- optional preallocated result array; a new one is taken
               from arg.new() when None
        '''
        arg = as_num_array(arg)
        # test identity, not truthiness, so a supplied array is always used
        if out is None:
            out = arg.new()
        out[:] = exponential_pdf(arg, mu)
        return out
class CusumReset(EquivUnary):
    '''CUSUM with reset algorithm

    A running sum that is never allowed to drop below reset_value:
    each output is max(reset_value, previous_output + value), starting
    from a previous output of 0.0.

    >>> func = CusumReset().iterfunc
    >>> assert func([1,-3,6,7,-7,-9]).tolist() == [1,0,6,13,6,0]
    '''
    name = 'cusum_reset'
    ranking = ('iterfunc', )
    #ranking = ('iterfunc','iterloop')
    tests = (Test([1, -3, 6, 7, -7, -9]) == [1, 0, 6, 13, 6, 0], )

    @staticmethod
    def iterfunc(arg, reset_value=0.0, out=None):
        '''Generator-based implementation.

        arg         -- input values (converted with as_num_array)
        reset_value -- floor applied to the running sum
        out         -- optional preallocated result array; a new one is
                       taken from arg.new() when None
        '''
        def gen_cusum(data, reset_value=0.0):
            # no obvious way to vectorize this
            out = 0.0
            for value in data:
                out = max(reset_value, out + value)
                yield out

        arg = as_num_array(arg)
        if out is None:
            out = arg.new()
        out[:] = list(gen_cusum(arg, reset_value))
        return out

    @staticmethod
    def iterloop(arg, reset_value=0.0, out=None):
        '''Index-loop implementation; same contract as iterfunc.'''
        arg = as_num_array(arg)
        # test identity, not truthiness, so a supplied array is always used
        if out is None:
            out = arg.new()
        last = 0.0
        for i, value in enumerate(arg):
            # carry the running sum forward; previously `last` was never
            # updated, so nothing was accumulated
            last = max(reset_value, last + value)
            out[i] = last
        return out
class CarryForward(EquivUnary):
    '''fill empty values with previous non-empty value

    A value counts as empty when it equals 0.  Leading empties, which have
    no previous value, stay 0.

    >>> func = CarryForward().naive_loop
    >>> assert func([0,3,0,0,4,0]).tolist() == [0,3,3,3,4,4]
    '''
    name = 'carry_forward'
    ranking = ('naive_loop', 'naive_iter', 'array_idx')
    tests = (
        Test([0,3,0,0,4,0]) == [0,3,3,3,4,4],
        Test([0,0,0,0,0,0]) == [0,0,0,0,0,0],
        Test([0,0,0,0,0,5]) == [0,0,0,0,0,5],
        Test([9,0,0,0,0,0]) == [9,9,9,9,9,9],
    )

    @staticmethod
    def array_idx(arg,out=None):
        '''Fill slice by slice between consecutive non-zero positions.

        arg -- input values (converted with as_num_array)
        out -- optional preallocated result array; a new one is taken
               from arg.new() when None
        '''
        arg = as_num_array(arg)
        # test identity, not truthiness, so a supplied array is always used
        if out is None:
            out = arg.new()
        idx = arg.nonzero()[0]
        try:
            first = idx[0]
        except IndexError:
            # no non-zero entries at all
            first = len(out)
        # everything before the first non-zero value stays empty
        out[:first] = 0
        if not len(idx):
            return out
        # each non-zero value fills up to (not including) the next one
        for start,stop in izip(idx[:-1],idx[1:]):
            out[start:stop] = arg[start]
        last = idx[-1]
        out[last:] = arg[last]
        return out

    @staticmethod
    def naive_iter(arg,out=None):
        '''Iterate over (index, value) pairs, remembering the last non-zero value.'''
        arg = as_num_array(arg)
        if out is None:
            out = arg.new()
        last = 0
        for i,value in izip(it.count(),arg):
            if value != 0:
                out[i] = last = value
            else:
                out[i] = last
        return out

    @staticmethod
    def naive_loop(arg,out=None):
        '''Index loop, remembering the last non-zero value.'''
        arg = as_num_array(arg)
        if out is None:
            out = arg.new()
        last = 0
        for i in xrange(len(arg)):
            if arg[i] != 0:
                last = arg[i]
            out[i] = last
        return out
def run():
    """Print the target URL, then run the CRUD and no-exchange tests against it."""
    # print() with a single argument prints the same text under Python 2
    # and is also valid Python 3
    print(URL)
    test = Test(URL)
    test.test_crud()
    test.test_no_exchange()
class PartitionIntegerGini(EquivBinary):
    """Given a pair of dependent+independent integer vectors, consider all
    possible cutpoints in the independent vector, and return list of Gini
    scores for resulting splits in dependent.

    For every cutpoint the data is split into the elements whose
    independent value is < cut and the remaining elements; the score of
    that split is computed on the dependent values of the two parts.
    When no cutpoints are passed they are derived from the independent
    vector with midpoints_integer().
    """
    itypes = ('i', 'i')
    ranking = ('isort', 'deltacnt2', 'deltacnt1', 'deltacnt0', 'presort',
               'idxsel', 'masksel', 'naive')
    bench_sizes = (
        1,
        2,
        3,
        4,
        5,
        10,
        20,
        50,
        100,
        200,
        300,
        500,
        1000,
        5000,
        10000,
        # the following sizes take too long to include in the default benchmarking run
        #50000,
        #100000,
        #500000,
        #1000000,
    )
    tests = (
        Test([1, 2]) == [0.0],
        Test([1, 2, 3])**([0.3333333] * 2),
        Test([1, 2, 3, 4])**([0.5] * 3),
        Test([1, 2, 3, 4, 5])**([0.5999999] * 4),
        Test([0, 1, 2, 3, 4])**([0.5999999] * 4),
        Test([1, 2, 3, 2, 1])**[0.2666666, 0.4],
        Test([3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])**[0.071428571428571425, 0.13186813186813176],
        Test([1, 2, 3, 4], [6, 7, 6, 8])**[0.5, 0.5],
        Test([1, 2, 3, 4, 5, 6], [6, 7, 6, 8, 9, 6])**([0.6666666] * 3),
        Test([1, 2, 1, 2, 1, 2], [6, 7, 8, 8, 9, 9])**[0.3999999, 0.5, 0.5],
        Test([1, 2, 1, 2, 1, 2], [4, 5, 6, 7, 8, 9])**[0.3999999, 0.5, 0.4444444, 0.5, 0.3999999],
        Test([1, 2, 1, 2, 1, 2], [4, 5, 6, 7, 8, 9], cutpoints=[7, 8])**[0.4444444, 0.5],
    )

    def _prep_testdata(self, *args, **kwargs):
        """Convert the inputs to arrays and pre-sort them by the first vector.

        Returns an (args, kwargs) tuple; kwargs gains dep_sorted=True once
        the vectors have been sorted here.
        """
        # benchmark for inputs that are already vectors
        # simplification for tests: dep == indep
        out = [as_num_array(arg) for arg in args]
        if len(out) == 1:
            out.append(out[0].copy())
        if not kwargs.get('dep_sorted'):
            # reorder every vector by the sort order of the first one
            idx = argsort(out[0])
            out = [take(vec, idx) for vec in out]
            kwargs['dep_sorted'] = True
        return (out, kwargs)

    @staticmethod
    def smart(dep, indep, cutpoints=None, **kwargs):
        """Dispatch to naive() for fewer than 100 elements, else to isort()."""
        # not needed unless high penalty for small datasets
        dep = as_num_array(dep)
        indep = as_num_array(indep)
        if cutpoints is None:
            cutpoints = midpoints_integer(indep)
        if len(dep) < 100:
            return PartitionIntegerGini.naive(dep, indep, cutpoints=cutpoints, **kwargs)
        return PartitionIntegerGini.isort(dep, indep, cutpoints=cutpoints, **kwargs)

    @staticmethod
    def isort(dep, indep, cutpoints=None, **kwargs):
        """Sort by indep once, then move counts across each cut incrementally.

        NOTE(review): the boundary scan below walks cutpoints in step with
        the ascending indep values, so it presumably requires cutpoints to
        be sorted ascending -- confirm for caller-supplied cutpoints.
        """
        dep = as_num_array(dep)
        indep = as_num_array(indep)
        if cutpoints is None:
            cutpoints = midpoints_integer(indep)
        if not len(cutpoints):
            return []
        # sort both vectors by *indep*
        idx = argsort(indep)
        dep = take(dep, idx)
        indep = take(indep, idx)
        #print len(dep),len(indep),len(cutpoints)
        # cutidx[n] becomes the position in the sorted data where the values
        # below cutpoints[n-1] end; entry 0 is the start of the data
        cutidx = [0, 0]
        for ival, isub in it.groupby(indep):
            ilen = len(list(isub))
            if ival < cutpoints[len(cutidx) - 2]:
                # still below the cutpoint being filled: extend its boundary
                cutidx[-1] += ilen
            else:
                if len(cutidx) > len(cutpoints):
                    # every cutpoint already has a boundary
                    break
                cutidx.append(cutidx[-1] + ilen)
        assert len(cutidx) - 1 == len(
            cutpoints), '%s != %s' % (len(cutidx) - 1, len(cutpoints))
        out = []
        # cnt1 / cnt2: per-value counts of dep below / not below the cut
        cnt2 = dict(histo_tuple(dep))
        cnt1 = dict.fromkeys(cnt2.keys(), 0)
        for i1, i2 in izip(cutidx[:-1], cutidx[1:]):
            # update the counts from the last cut
            for d, cnt in histo_tuple(dep[i1:i2]):
                cnt1[d] += cnt
                cnt2[d] -= cnt
            # calculate results based on counts
            a1 = as_num_array([val for val in cnt1.itervalues() if val != 0])
            a2 = as_num_array([val for val in cnt2.itervalues() if val != 0])
            out.append(gini2_counts(a1, a2))
        assert len(out) == len(cutpoints), '%s != %s' % (len(out),
                                                         len(cutpoints))
        return out

    @staticmethod
    def deltacnt2(dep, indep, cutpoints=None, **kwargs):
        """Incremental counts; the newly covered elements of each cut are
        tallied with histo_tuple()."""
        dep = as_num_array(dep)
        indep = as_num_array(indep)
        if cutpoints is None:
            cutpoints = midpoints_integer(indep)
        out = []
        # dictionary of counts in each dataset
        cnt2 = dict(histo_tuple(dep))
        cnt1 = dict.fromkeys(cnt2.keys(), 0)
        # comparing a vector with itself gives an all-false starting mask
        lastmask = (indep != indep)
        for cut in cutpoints:
            mask = indep < cut
            # examine only the new values from the last cut
            maskdelta = mask & ~lastmask
            lastmask |= mask
            idxdelta = nonzero(maskdelta)[0]
            # update the counts from the last cut
            for d, cnt in histo_tuple(dep[idxdelta]):
                cnt1[d] += cnt
                cnt2[d] -= cnt
            # calculate results based on counts
            a1 = as_num_array([val for val in cnt1.itervalues() if val != 0])
            a2 = as_num_array([val for val in cnt2.itervalues() if val != 0])
            out.append(gini2_counts(a1, a2))
        return out

    @staticmethod
    def deltacnt1(dep, indep, cutpoints=None, **kwargs):
        """Incremental counts; the newly covered elements of each cut are
        moved between the two dicts one at a time."""
        dep = as_num_array(dep)
        indep = as_num_array(indep)
        if cutpoints is None:
            cutpoints = midpoints_integer(indep)
        out = []
        # dictionary of counts in each dataset
        cnt2 = dict(histo_tuple(dep))
        cnt1 = dict.fromkeys(cnt2.keys(), 0)
        # comparing a vector with itself gives an all-false starting mask
        lastmask = (indep != indep)
        for cut in cutpoints:
            mask = indep < cut
            # examine only the new values from the last cut
            maskdelta = mask & ~lastmask
            lastmask |= mask
            idxdelta = nonzero(maskdelta)[0]
            # update the counts from the last cut
            for d in dep[idxdelta]:
                cnt1[d] += 1
                cnt2[d] -= 1
            # calculate results based on counts
            a1 = as_num_array([val for val in cnt1.itervalues() if val != 0])
            a2 = as_num_array([val for val in cnt2.itervalues() if val != 0])
            out.append(gini2_counts(a1, a2))
        return out

    @staticmethod
    def deltacnt0(dep, indep, cutpoints=None, **kwargs):
        """Incremental counts kept in two arrays instead of two dicts."""
        dep = as_num_array(dep)
        indep = as_num_array(indep)
        if cutpoints is None:
            cutpoints = midpoints_integer(indep)
        out = []
        # get vector of counts in each dataset
        tmp = histo_tuple(dep)
        # position of each distinct dep value within the count vectors
        dep_keys = dict((x[0], i) for i, x in enumerate(tmp))
        cnt2 = as_num_array([x[1] for x in tmp])
        # same-shaped vector of zeros
        cnt1 = cnt2 - cnt2
        # comparing a vector with itself gives an all-false starting mask
        lastmask = (indep != indep)
        for cut in cutpoints:
            mask = indep < cut
            # only the elements newly covered since the previous cut
            maskdelta = mask & ~lastmask
            lastmask |= mask
            for d in dep[maskdelta]:
                key = dep_keys[d]
                cnt1[key] += 1
                cnt2[key] -= 1
            a1 = cnt1[cnt1 != 0]
            a2 = cnt2[cnt2 != 0]
            out.append(gini2_counts(a1, a2))
        return out

    @staticmethod
    def presort(dep, indep, cutpoints=None, dep_sorted=False):
        """Sort by dep once (unless dep_sorted is true), then score every
        split with gini2_presorted()."""
        dep = as_num_array(dep)
        indep = as_num_array(indep)
        if not dep_sorted:
            idx = argsort(dep)
            dep = take(dep, idx)
            indep = take(indep, idx)
        if cutpoints is None:
            cutpoints = midpoints_integer(indep)
        out = []
        for cut in cutpoints:
            mask = indep < cut
            i1 = nonzero(mask)[0]
            i2 = nonzero(~mask)[0]
            a1 = dep[i1]
            a2 = dep[i2]
            out.append(gini2_presorted(a1, a2))
        return out

    @staticmethod
    def idxsel(dep, indep, cutpoints=None, **kwargs):
        """Select both sides of each split by index vector, score with gini2()."""
        dep = as_num_array(dep)
        indep = as_num_array(indep)
        if cutpoints is None:
            cutpoints = midpoints_integer(indep)
        out = []
        for cut in cutpoints:
            mask = indep < cut
            i1 = nonzero(mask)[0]
            i2 = nonzero(~mask)[0]
            a1 = dep[i1]
            a2 = dep[i2]
            out.append(gini2(a1, a2))
        return out

    @staticmethod
    def masksel(dep, indep, cutpoints=None, **kwargs):
        """Select both sides of each split by boolean mask, score with gini2()."""
        dep = as_num_array(dep)
        indep = as_num_array(indep)
        if cutpoints is None:
            cutpoints = midpoints_integer(indep)
        out = []
        for cut in cutpoints:
            mask = indep < cut
            a1 = dep[mask]
            a2 = dep[~mask]
            out.append(gini2(a1, a2))
        return out

    @staticmethod
    def naive(dep, indep, cutpoints=None, **kwargs):
        """Pure-python reference: rebuild both sides of every split with
        list comprehensions, score with gini2()."""
        if cutpoints is None:
            cutpoints = midpoints_integer(indep)
        out = []
        for cut in cutpoints:
            a1 = [d for d, i in izip(dep, indep) if i < cut]
            a2 = [d for d, i in izip(dep, indep) if i >= cut]
            out.append(gini2(a1, a2))
        return out
class WindowApply(EquivUnary):
    """Apply a function to each sliding window of a single data vector.

    Window boundaries come from arg_sel_step_to_idx(arg, sel, step); for
    each pair of boundaries (j, k) that lie `step` positions apart, `func`
    is applied to arg[j+1:k+1].
    """
    name = 'window_apply'
    ranking = ('naive1', 'naive2')
    bench_sizes = (1, 2, 3, 4, 5, 10, 20, 50, 100, 200, 300, 500, 1000, 5000)
    tests = (
        Test(arange(20),sel=[3,5,9,19]) == [6,9,30,145],
        Test(arange(10),sel=zeros(10,type='Bool'),step=1) == [],
        Test(arange(10),sel=ones(10,type='Bool'),step=1)
            == [0,1,2,3,4,5,6,7,8,9],
        Test(arange(10),sel=(arange(10)%2)==0,step=1) == [0,3,7,11,15],
        Test(arange(10),sel=(arange(10)%2)!=0,step=1) == [1,5,9,13,17],
        Test(arange(10),step=2) == [1,3,5,7,9,11,13,15,17],
        Test(arange(1,11),step=2) == [2,5,7,9,11,13,15,17,19],
        Test(arange(10),step=3) == [3,6,9,12,15,18,21,24],
        Test(arange(1,11),step=3) == [5,9,12,15,18,21,24,27],
        Test(arange(2,12),step=3) == [7,12,15,18,21,24,27,30],
        Test([0,1,2,3],func=sum) == [0,1,2,3],
        Test([0,1,2,3],step=2) == [1,3,5],
        #Test([0,1,2,3],func=minimum) == [0,1,2,3],
    )

    def _prep_testdata_broken(self, *args, **kwargs):
        """Convert inputs to arrays; with no keyword arguments, select the
        positions where the first input equals 0."""
        converted = [as_num_array(item) for item in args]
        if not kwargs:
            # automatic test cases
            kwargs['sel'] = as_num_array(args[0]) == 0
        return (converted, kwargs)

    @staticmethod
    def naive1(arg, sel=None, step=1, func=sum):
        """Walk the boundary vector by position."""
        data = as_num_array(arg)
        marks = arg_sel_step_to_idx(data, sel, step)
        results = []
        for pos in xrange(step, len(marks)):
            begin = marks[pos - step]
            end = marks[pos]
            results.append(func(data[begin + 1:end + 1]))
        return as_num_array(results)

    @staticmethod
    def naive2(arg, sel=None, step=1, func=sum):
        """Pair up two shifted views of the boundary vector."""
        data = as_num_array(arg)
        marks = arg_sel_step_to_idx(data, sel, step)
        results = []
        for begin, end in izip(marks[:-step], marks[step:]):
            results.append(func(data[begin + 1:end + 1]))
        return as_num_array(results)
class Gini(EquivUnary):
    """GINI on a single vector

    Every implementation computes 1 - sum(f*f), where f is the fraction of
    the vector taken up by each distinct value; they differ only in how
    the per-value counts are gathered.
    """
    itypes = 'i'
    name = 'gini'
    ranking = ('groupby2', 'smart', 'loop3', 'groupby3', 'groupby1', 'loop2',
               'loop1')
    tests = (
        Test([0, 0, 0]) == 0.0,
        Test([1, 1, 1]) == 0.0,
        Test([0, 0, 1])**0.444444444444,
        Test([0, 1, 1])**0.444444444444,
        Test([1, 1, 2])**0.444444444444,
        Test([1, 2, 2])**0.444444444444,
        Test([1, 2, 3])**0.666666666667,
        Test([1, 2, 3, 4]) == 0.75,
        Test([4, 3, 2, 1]) == 0.75,
        Test([1, 2, 3, 2, 3, 3])**0.611111111111,
        Test([2, 1, 3, 200, 3000, 30000])**0.833333333333,
        Test(range(100))**0.99,
        Test(range(1000))**0.999,
    )

    @staticmethod
    def smart(arg):
        """Use loop3 for fewer than 50 elements, groupby2 otherwise."""
        data = as_num_array(arg)
        if len(data) < 50:
            return Gini.loop3(data)
        return Gini.groupby2(data)

    @staticmethod
    def groupby3(arg):
        """Run lengths of the sort()-ed data, squared and summed as an array."""
        data = as_num_array(arg)
        total = float(len(data))
        run_sizes = [len(list(run)) for key, run in groupby(sort(data))]
        fractions = as_num_array(run_sizes) / total
        fractions *= fractions
        return 1.0 - fractions.sum()

    @staticmethod
    def groupby2(arg):
        """Run lengths of the sorted() data, turned into fractions up front."""
        data = as_num_array(arg)
        total = float(len(data))
        fractions = [len(list(run)) / total for key, run in groupby(sorted(data))]
        score = 1.0
        for frac in fractions:
            score -= frac * frac
        return score

    @staticmethod
    def groupby1(arg):
        """Build a (value, count) histogram with groupby, then fold it."""
        data = as_num_array(arg)
        histogram = [(key, len(list(run))) for key, run in groupby(sorted(data))]
        total = float(len(data))
        score = 1.0
        for (key, size) in histogram:
            frac = size / total
            score -= frac * frac
        return score

    @staticmethod
    def loop3(arg):
        """Dict tally via setdefault(); fold over itervalues()."""
        data = as_num_array(arg)
        total = float(len(data))
        tally = {}
        for item in data:
            tally[item] = 1 + tally.setdefault(item, 0)
        score = 1.0
        for size in tally.itervalues():
            frac = (size * 1.0) / total
            score -= frac * frac
        return score

    @staticmethod
    def loop2(arg):
        """Dict tally via get(); fold over iteritems()."""
        data = as_num_array(arg)
        total = float(len(data))
        tally = {}
        for item in data:
            seen = tally.get(item, 0)
            tally[item] = seen + 1
        score = 1.0
        for (item, size) in tally.iteritems():
            frac = (size * 1.0) / total
            score -= frac * frac
        return score

    @staticmethod
    def loop1(arg):
        """Dict tally via get(); fold over items()."""
        data = as_num_array(arg)
        total = float(len(data))
        tally = {}
        for item in data:
            seen = tally.get(item, 0)
            tally[item] = seen + 1
        score = 1.0
        for (item, size) in tally.items():
            frac = (size * 1.0) / total
            score -= frac * frac
        return score
class GiniGain(EquivBinary):
    """GINI gain for a given split

    The gain is the gini of the two vectors concatenated minus the
    pair-wise gini of the split; it is 0.0 when either side is empty.

    Note: this is unlikely to be the most efficient approach for tree
    building, because the gini of the combined node is recomputed for
    every split.  An application would do better to compute that once and
    then loop over the candidate splits.  This class is meant more as
    documentation and as an extra test of how the other methods combine.
    """
    itypes = ('i', 'i')
    name = 'gini_gain'
    ranking = ('simple1', )
    tests = (
        Test([], [1, 1, 1, 1, 1, 1]) == 0.0,
        Test([1], [1, 1, 1, 1, 1]) == 0.0,
        Test([1, 1, 1], [1, 1, 1]) == 0.0,
        Test([1], [1, 1, 2, 2, 2])**0.1,
        Test([1, 1], [1, 2, 2, 2]) == 0.25,
        Test([1, 1, 1], [2, 2, 2]) == 0.5,
        Test([1, 1, 1, 2], [2, 2]) == 0.25,
        Test([1, 1, 1], [1, 1, 2])**0.055555555555,
        Test([1, 2, 3], [1, 2, 3]) == 0.0,
        Test([1, 2, 3], [1, 2, 3, 4, 5])**0.03125,
        Test([1, 3], [2, 999]) == 0.25,
    )

    @staticmethod
    def simple1(arg1, arg2):
        """Gini of the whole minus gini of the split."""
        whole_score = Gini()
        split_score = Gini2()
        parts = []
        for vec in (arg1, arg2):
            if len(vec):
                parts.append(as_num_array(vec))
        if len(parts) == 2:
            return whole_score(concatenate(parts)) - split_score(*parts)
        # an empty side means nothing was split off
        return 0.0
class GetShift(EquivUnary):
    '''get value of other column shifted by some offset
    (typically useful for previous value (or arbitrary offset into column)

    out[i] is arg[i-offset]; positions for which i-offset falls outside
    the data are set to filler.  The default offset of 1 therefore yields
    the previous value, and a negative offset looks ahead.

    >>> func = GetShift()
    >>> assert func([1,3,6,7,7,9],filler=99).tolist() == [99,1,3,6,7,7]
    '''
    name = 'get_shift'
    ranking = ('smart','fast', 'naive_comp', 'naive_loop',)
    tests = (
        Test([0]) == [0],
        Test([1,3,6,7,7,9]) == [0,1,3,6,7,7],
        Test([1,3,6,7,7,9],offset=3) == [0,0,0,1,3,6,],
        Test([1,3,6,7,7,9],offset=-2) == [6,7,7,9,0,0],
        Test([1,3,6,7,7,9],filler=99) == [99,1,3,6,7,7],
    )

    @staticmethod
    def naive_loop(arg,offset=1,filler=0,out=None):
        '''Element-by-element loop.

        arg    -- input values (converted with as_num_array)
        offset -- number of positions to shift by (negative looks ahead)
        filler -- value written where no shifted element exists
        out    -- optional preallocated result array; a new one is taken
                  from arg.new() when None
        '''
        arg = as_num_array(arg)
        # test identity, not truthiness, so a supplied array is always used
        if out is None:
            out = arg.new()
        if offset < 0:
            cut = len(arg)+offset
            for i in xrange(cut):
                out[i] = arg[i-offset]
            for i in xrange(cut,len(arg)):
                out[i] = filler
        else:
            cut = offset
            for i in xrange(cut):
                out[i] = filler
            for i in xrange(cut,len(arg)):
                out[i] = arg[i-offset]
        return out

    @staticmethod
    def naive_comp(arg,offset=1,filler=0,out=None):
        '''List comprehensions assigned into output slices; same contract
        as naive_loop.'''
        arg = as_num_array(arg)
        if out is None:
            out = arg.new()
        if offset < 0:
            cut = len(arg)+offset
            out[:cut] = [arg[i] for i in xrange(-offset,len(arg))]
            out[cut:] = [filler]*(-offset)
        else:
            cut = offset
            out[:cut] = [filler]*(offset)
            out[cut:] = [arg[i] for i in xrange(len(arg)-cut)]
        return out

    @staticmethod
    def fast(arg,offset=1,filler=0,out=None):
        '''Slice-based version; same contract as naive_loop.'''
        arg = as_num_array(arg)
        if offset < 0:
            cut = len(arg)+offset
            out1 = arg[-offset:len(arg)]
            out2 = (-offset)*[filler]
        else:
            cut = offset
            out1 = offset*[filler]
            out2 = arg[:len(arg)-cut]
        if out is None:
            out = arg.new()
        out[:cut] = out1
        out[cut:] = out2
        return out

    @classmethod
    def smart(cls,arg,offset=1,filler=0,out=None):
        '''Use the plain loop for inputs shorter than 100, else the slice version.'''
        if len(arg) < 100:
            return cls.naive_loop(arg,offset=offset,filler=filler,out=out)
        return cls.fast(arg,offset=offset,filler=filler,out=out)

    @staticmethod
    def _check_result(out,arg,**kwargs):
        '''Assert that `out` is `arg` shifted by kwargs' offset (default 1)
        and padded with kwargs' filler (default 0); returns True.'''
        offset = kwargs.get('offset',1)
        filler = kwargs.get('filler',0)
        if offset < 0:
            cut = len(arg)+offset
            for i in xrange(cut):
                assert out[i] == arg[i-offset]
            for i in xrange(cut,len(arg)):
                assert out[i] == filler
        else:
            cut = offset
            for i in xrange(cut):
                assert out[i] == filler
            for i in xrange(cut,len(arg)):
                assert out[i] == arg[i-offset]
        return True
class LinearBins(EquivUnary):
    """Discretize to given number of equal sized bins, returning either
    1) if idx==True, the bin number (0 to bins-1) for each value.
    2) otherwise returning midpoint value for the range in each bin.

    The bin width is (max - min) / bins, where min and max default to the
    smallest and largest input value.  When that width is zero, a vector
    of zeros is returned.

    This is mainly a convenience wrapper to Discretize().
    """
    ranking = ('wrapper',)
    tests = (
        Test([0]) == [0],
        Test([0,2],bins=1) == [1.0,1.0],
        Test([0,2],bins=2) == [0.5,1.5],
        Test([0,2],bins=3) ** [0.333333,1.666666],
        Test([0,1,2],bins=3) ** [0.333333,1.0,1.666666],
        Test([1,2,3,4,5],bins=3) ** [1.666666,1.666666,3.0,4.333333,4.333333],
        Test([1,2,1.3,1.4,1.33],bins=2) == [1.25,1.75,1.25,1.25,1.25],
        Test([1,2,1.3,1.4,1.33],bins=2,min=0.5) == [0.875,1.625,1.625,1.625,1.625],
        Test([1,2,1.3,1.4,1.33],bins=2,min=-1) == [1.25,1.25,1.25,1.25,1.25],
        Test([0],idx=True) == [0],
        Test([0,2],bins=1,idx=True) == [0,0],
        Test([0,2],bins=2,idx=True) == [0,1],
        Test([0,2],bins=3,idx=True) == [0,2],
        Test([0,1,2],bins=3,idx=True) == [0,1,2],
        Test([1,2,3,4,5],bins=3,idx=True) == [0,0,1,2,2],
        Test([1,2,1.3,1.4,1.33],bins=2,idx=True) == [0,1,0,0,0],
        Test([1,2,1.3,1.4,1.33],bins=2,min=0.5,idx=True) == [0,1,1,1,1],
        Test([1,2,1.3,1.4,1.33],bins=2,min=-1,idx=True) == [1,1,1,1,1],
        Test([100,75,50,25,2,1,0],bins=35,max=500,idx=True) == [6,5,3,1,0,0,0],
    )

    @staticmethod
    def wrapper(arg,bins=10,min=None,max=None,idx=None):
        """Build equal-width boundaries and delegate to discretize().

        arg  -- input values (converted with as_num_array)
        bins -- number of equal-width bins
        min  -- lower end of the binned range (default: arg.min())
        max  -- upper end of the binned range (default: arg.max())
        idx  -- true to return bin numbers, false to return bin midpoints
        """
        arg = as_num_array(arg)
        if min is None:
            min = arg.min()
        if max is None:
            max = arg.max()
        # force true division: with integer data an integer quotient would
        # truncate the bin width (e.g. a range of 2 over 3 bins)
        step = (max - min) / float(bins)
        if not step:
            # zero-width range: everything falls in one bin
            return zeros(len(arg))
        boundaries = arange(min+step,max+step,step)
        if idx:
            values = None
        else:
            # midpoint of each bin; trimmed to one more entry than there
            # are boundaries, as discretize() requires
            values = arange(min+(step/2),max+step+step,step)[:len(boundaries)+1]
        return discretize(arg,boundaries=boundaries,values=values)
def run():
    """Print the target URL, then build a Test for it and invoke it."""
    print(URL)
    runner = Test(URL)
    runner()
class LookAheadIndex(EquivUnary):
    """Given
    1) a vector of sorted values that may contain duplicates and irregular
       spacing (like event timestamps, for example), and
    2) a numeric delta to add to the vector (to identify the future
       timestamp some fixed interval away),
    return an index vector into the original data vector such that for each
    current value, the index points to either
    1) the first occurrence of current value+delta, or
    2) if the data doesn't contain such value, the first occurrence of the
       next lower value that does occur in the data.

    If delta==0, then the index points to the first/last occurrence of the
    current value in the data.

    The `first` flag selects whether the first (True) or last (False)
    occurrence within a run of equal values is reported.
    """
    itypes = 'i'
    # NOTE: this first assignment is immediately overridden by the next one
    ranking = ('naive_loop',)
    ranking = ('missing1','naive_loop')
    bench_sizes = (1,2,3,4,5,10,20,50,100,200,300,500,1000,5000,
        #10000,
        #50000,
        #100000,
        #500000,
        #1000000,
    )
    tests = (
        Test([0,0,1,1,1,2,3,5,9],delta=0,first=True) == [0,0,2,2,2,5,6,7,8],
        Test([0,0,1,1,1,2,3,5,9],delta=1,first=True) == [2,2,5,5,5,6,6,7,8],
        Test([0,0,1,1,1,2,3,5,9],delta=2,first=True) == [5,5,6,6,6,6,7,7,8],
        Test([0,0,1,1,1,2,3,5,9],delta=3,first=True) == [6,6,6,6,6,7,7,7,8],
        Test([0,0,1,1,1,2,3,5,9],delta=4,first=True) == [6,6,7,7,7,7,7,8,8],
        Test([0,0,1,1,1,2,3,5,9],delta=5,first=True) == [7,7,7,7,7,7,7,8,8],
        Test([0,0,1,1,1,2,3,5,9],delta=0,first=False) == [1,1,4,4,4,5,6,7,8],
        Test([0,0,1,1,1,2,3,5,9],delta=1,first=False) == [4,4,5,5,5,6,6,7,8],
        Test([0,0,1,1,1,2,3,5,9],delta=2,first=False) == [5,5,6,6,6,6,7,7,8],
        Test([0,0,1,1,1,2,3,5,9],delta=3,first=False) == [6,6,6,6,6,7,7,7,8],
        Test([0,0,1,1,1,2,3,5,9],delta=4,first=False) == [6,6,7,7,7,7,7,8,8],
        Test([0,0,1,1,1,2,3,5,9],delta=5,first=False) == [7,7,7,7,7,7,7,8,8],
        Test([0,0,1,1,1,2,3,5,9],delta=-0,first=True) == [0,0,2,2,2,5,6,7,8],
        Test([0,0,1,1,1,2,3,5,9],delta=-1,first=True) == [0,0,0,0,0,2,5,6,7],
        Test([0,0,1,1,1,2,3,5,9],delta=-2,first=True) == [0,0,0,0,0,0,2,6,7],
        Test([0,0,1,1,1,2,3,5,9],delta=-3,first=True) == [0,0,0,0,0,0,0,5,7],
        Test([0,0,1,1,1,2,3,5,9],delta=-4,first=True) == [0,0,0,0,0,0,0,2,7],
        Test([0,0,1,1,1,2,3,5,9],delta=-5,first=True) == [0,0,0,0,0,0,0,0,6],
        Test([0,0,1,1,1,2,3,5,9],delta=-0,first=False) == [1,1,4,4,4,5,6,7,8],
        Test([0,0,1,1,1,2,3,5,9],delta=-1,first=False) == [1,1,1,1,1,4,5,6,7],
        Test([0,0,1,1,1,2,3,5,9],delta=-2,first=False) == [1,1,1,1,1,1,4,6,7],
        Test([0,0,1,1,1,2,3,5,9],delta=-3,first=False) == [1,1,1,1,1,1,1,5,7],
        Test([0,0,1,1,1,2,3,5,9],delta=-4,first=False) == [1,1,1,1,1,1,1,4,7],
        Test([0,0,1,1,1,2,3,5,9],delta=-5,first=False) == [1,1,1,1,1,1,1,1,6],
        Test([0],delta=-1) == [],
        Test([1,5],delta=-3) == [0,0],
        Test([0]) == [],
        Test([1,5],delta=3) == [0,1],
    )

    def _prep_testdata(self,*args,**kwargs):
        """Return each positional input sorted and converted to an array.

        The keyword arguments are accepted but not used or returned.
        """
        # benchmark for inputs that are already vectors
        # simplification for tests: dep == indep
        return [as_num_array(sorted(arg)) for arg in args]

    @staticmethod
    def missing1(arg,delta=1,first=False):
        """Lookup-table implementation.

        Builds a dict from each distinct value to the index of its first
        (or last) occurrence, adds entries for every value+delta that does
        not occur in the data, then answers each element by dict lookup.
        """
        # build answer lookup mapping each arg value to first index
        run_lens = [(k,len(list(g))) for k,g in groupby(arg)]
        keys = as_num_array([k for k,l in run_lens])
        lens = as_num_array([l for k,l in run_lens])
        # ends[i] is one past the last index of run i, starts[i] its first index
        ends = cumsum(lens)
        starts = ends - lens
        if first:
            answer = dict(izip(keys,starts))
        else:
            answer = dict(izip(keys,ends-1))
        # identify missing keys
        need = keys + delta
        needset = set(need)
        haveset = set(answer)
        fillset = needset.difference(haveset)
        fill = as_num_array(sorted(fillset))
        #
        #print
        #print 'haveset:', haveset
        #print 'need:', need
        #print 'fill:', fill
        #print 'answer1:', answer
        #
        minkey,maxkey = arg[0],arg[-1]
        #
        # walk both the existing keys and the missing keys from largest to
        # smallest; have_iter is consumed progressively across iterations
        have_iter = iter(keys[-1::-1])
        fill_iter = iter(fill[-1::-1])
        thiskey = maxkey
        thisval = answer[thiskey]
        for fillkey in fill_iter:
            # print 'fillkey:', fillkey
            if thiskey >= fillkey:
                try:
                    # advance to the largest existing key below fillkey
                    thiskey = dropwhile(lambda x:x>=fillkey,have_iter).next()
                except StopIteration:
                    # nothing lower exists: fall back to the smallest key
                    thiskey = minkey
                thisval = answer[thiskey]
            answer[fillkey] = thisval
        #print 'answer2:', answer
        out = [answer[val+delta] for val in arg]
        return out

    @staticmethod
    def naive_loop(arg,delta=1,first=False):
        """Scan implementation: for every element, search forward or
        backward for the target value, then step to the first or last
        index of the run of values equal to what was found."""
        out = []
        for i,val in enumerate(arg):
            # find answer range
            target = val + delta
            jj = i
            if target > val:
                # look forward
                for j in xrange(i+1,len(arg)):
                    if arg[j] > target:
                        break
                else:
                    # ran off the end without exceeding the target
                    j = len(arg)
                # step back to the last element that is <= target
                jj = j-1
                target = arg[jj]
            elif target < val:
                # look backward
                for j in xrange(i-1,-1,-1):
                    if arg[j] <= target:
                        break
                else:
                    # nothing at or below the target: use the first element
                    j = 0
                jj = j
                target = arg[jj]
            # find first or last answer within range
            if first:
                kk = 0
                for k in xrange(jj,-1,-1):
                    if arg[k] != target:
                        kk = k+1
                        break
            else:
                kk = len(arg)-1
                for k in xrange(jj,len(arg)):
                    if arg[k] != target:
                        kk = k-1
                        break
            out.append(kk)
        return out