def handle(self, tbl, opt):
    '''Convert the columns of tbl to the format chosen by get_bestfit().

    Nothing is done, and None is returned, when opt.save_space is None
    (any other value, including a falsy one, enables the conversion).

    Otherwise each (key, array) pair of tbl is examined: when the format
    reported by get_format() differs from the one reported by
    get_bestfit(), the conversion is logged and the entry is replaced by
    arr.astype(<best format>).  tbl is modified in place and returned.
    '''
    if opt.save_space is not None:
        for name, column in tbl.items():
            current = get_format(column)
            best = get_bestfit(column)
            if current != best:
                log.info('converting field %s from type %r to %r', name, current, best)
                tbl[name] = column.astype(best)
        return tbl
    # feature disabled: callers get None rather than the table
    return None
def diffkeys_explore(self, key1, key2, dump_sample=0):
    '''Return a list of observations describing how two fields differ.

    The fields named key1 and key2 are fetched with self.fields() and
    compared element-wise.  An empty list means no element differs.
    Otherwise the list holds human-readable strings, in this order and
    only where applicable:

    * the values agree under na.allclose() (skipped for alpha fields),
    * count and percentage of differing elements,
    * both type codes, when they are not the same,
    * the values agree after casting to 'Int' or to 'Bool',
    * the differing values are related by a constant factor,
    * mean and range of the element-wise difference,
    * if dump_sample is non-zero, the lines of a printed table holding at
      most dump_sample differing rows (index plus both values).

    A field is treated as alpha when its type code (from get_format())
    starts with 'a'.  If either field is alpha, nothing after the
    type-code line is produced, including the sample dump.
    '''
    out = []
    vals = self.fields(key1, key2)
    notequal = vals[0] != vals[1]
    diffcnt = notequal.sum()
    if diffcnt == 0:
        return out

    tcodes = [get_format(val) for val in vals]
    has_alpha = 'a' in [tcode[0] for tcode in tcodes]

    # are all values within floating point fuzz?
    if not has_alpha and na.allclose(*vals):
        out.append('no differences using allclose()')

    # the exact-comparison count is reported even when allclose() passed
    total = len(self)
    out.append('%s values differ (%1.2f%% of %s)' % (diffcnt, 100.0 * diffcnt / total, total))
    if tcodes[0] != tcodes[1]:
        out.append('field types differ %s' % str(tuple(tcodes)))

    # skip detail if any field is alpha type
    if has_alpha:
        return out

    # differences as different types?
    for typestr in ('Int', 'Bool'):
        tvals = [v.astype(typestr) for v in vals]
        if na.allclose(*tvals):
            out.append('field values match as type(%r)' % typestr)

    # extract differences and examine in greater detail
    dvals = [any_compress(notequal, val) for val in vals]
    nzmask = [(dval != 0) for dval in dvals]
    if (nzmask[0] != nzmask[1]).sum() == 0:
        # all zeros match, compare the nonzero values
        nzvals = [any_compress(nzmask[0], dval) for dval in dvals]
        ratio = nzvals[1].astype('Float') / nzvals[0].astype('Float')
        factor = ratio.mean()
        if na.allclose(ratio, factor):
            out.append('field values differ by constant factor: %f' % factor)

    delta = dvals[0] - dvals[1]
    dmin, dmax = delta.min(), delta.max()
    out.append('difference mean=%f range=%f (%s to %s)' % (delta.mean(), dmax - dmin, dmin, dmax))

    if dump_sample:
        tmp = UniTable()
        # positions of the differing elements within the full fields
        tmp['_idx_'] = na.nonzero(notequal)[0]
        for key in (key1, key2):
            tmp[key] = self.field(key)[notequal]
        if len(tmp) > dump_sample:
            tmp.resize(dump_sample)
        out.extend(str(tmp).split('\n'))
    return out
def diffkeys_explore(self, key1, key2, dump_sample=0):
    '''Compare two fields and return a list of difference observations.

    key1, key2  -- names of the fields to compare (read via self.fields())
    dump_sample -- when non-zero, append the printed lines of a small table
                   showing up to this many differing rows

    Returns [] when every element is equal.  Otherwise returns a list of
    strings: an allclose() note (non-alpha fields only), the number and
    percentage of differing elements, and a type-code note if the codes
    differ.  For non-alpha fields the list continues with notes on whether
    the fields match when cast to 'Int' or 'Bool', whether the differing
    values share a constant factor, and the mean and range of the
    difference, followed by the optional sample dump.

    A field counts as alpha when its get_format() code begins with 'a'.
    '''
    observations = []
    vals = self.fields(key1, key2)
    notequal = vals[0] != vals[1]
    diffcnt = notequal.sum()
    if diffcnt == 0:
        # identical fields: nothing to report
        return observations

    tcodes = [get_format(val) for val in vals]
    alpha_fields = [tcode[0] for tcode in tcodes].count('a')

    # are all values within floating point fuzz?
    if not alpha_fields and na.allclose(*vals):
        observations.append('no differences using allclose()')

    # always report the exact mismatch count, fuzzy match or not
    nrows = len(self)
    percent = 100.0 * diffcnt / nrows
    observations.append('%s values differ (%1.2f%% of %s)' % (diffcnt, percent, nrows))

    if tcodes[0] != tcodes[1]:
        observations.append('field types differ %s' % str(tuple(tcodes)))

    # skip detail if any field is alpha type
    if alpha_fields:
        return observations

    # differences as different types?
    for typestr in ('Int', 'Bool'):
        casted = [val.astype(typestr) for val in vals]
        if na.allclose(*casted):
            observations.append('field values match as type(%r)' % typestr)

    # extract differences and examine in greater detail
    dvals = [any_compress(notequal, val) for val in vals]
    nzmask = [(dval != 0) for dval in dvals]
    zeros_line_up = (nzmask[0] != nzmask[1]).sum() == 0
    if zeros_line_up:
        # all zeros match, compare the nonzero values
        nzvals = [any_compress(nzmask[0], dval) for dval in dvals]
        ratio = nzvals[1].astype('Float') / nzvals[0].astype('Float')
        factor = ratio.mean()
        if na.allclose(ratio, factor):
            observations.append('field values differ by constant factor: %f' % factor)

    delta = dvals[0] - dvals[1]
    dmin, dmax = delta.min(), delta.max()
    observations.append('difference mean=%f range=%f (%s to %s)' % (
        delta.mean(), dmax - dmin, dmin, dmax))

    if dump_sample:
        sample = UniTable()
        # row positions of the mismatches, then both fields at those rows
        sample['_idx_'] = na.nonzero(notequal)[0]
        for key in (key1, key2):
            sample[key] = self.field(key)[notequal]
        if len(sample) > dump_sample:
            sample.resize(dump_sample)
        observations.extend(str(sample).split('\n'))
    return observations
def _iter_pptbl(self,sep='|',xsep='+',xfill='-',method=str,text='right',rhead=0): sizes = [] values = [export_string(value) for value in self.values()] for (name,col) in zip(self.keys(),values): # this is a lookahead on entire dataset to find max # print size for each field - try to short circuit where possible namesize = len(method(name)) fldsize = None try: if is_char_array(col): fldsize = col.maxLen() elif col is None: fldsize = len(method(None)) else: natype = get_format(col) is_float = natype.startswith('Float') if not is_float: nasize = col.itemsize() if nasize*3 <= namesize: fldsize = namesize else: minmax = min(col),max(col) fldsize = max(len(prtfld) for prtfld in it.imap(method,minmax)) except: pass if fldsize is None: # no shortcut found, convert entire column try: fldsize = max(len(prtfld) for prtfld in it.imap(method,col)) except: fldsize = 0 sizes.append(max(namesize,fldsize)) xbar = xsep + xsep.join([w*xfill for w in sizes]) + xsep out = [name.center(w) for (w,name) in zip(sizes,self.keys())] headline = sep + sep.join(out) + sep if text == 'left': for i,col in enumerate(values): try: if is_char_array(col): sizes[i] = -sizes[i] except: pass formats = ['%%%ss' % s for s in sizes] cols = [] for col in values: if col is None: col = [col]*len(self) cols.append(col) # finally, yield the result if not rhead: # if not repeating header, yield it first yield xbar yield headline yield xbar for rownum in range(len(self)): if rhead and rownum % rhead == 0: # repeat header at specified interval yield xbar yield headline yield xbar out = [(fmt % method(col[rownum])) for (fmt,col) in zip(formats,cols)] yield sep + sep.join(out) + sep yield xbar
def get_type_codes(self, arrs=None):
    '''Return a list with the get_format() code of each array in arrs.

    When arrs is None, the arrays returned by self.values() are used.
    '''
    columns = self.values() if arrs is None else arrs
    codes = []
    for column in columns:
        codes.append(get_format(column))
    return codes