Example #1
0
  def handle(self,tbl,opt):
    '''Re-type each field of tbl to its best-fit format.

    Returns None, leaving tbl untouched, when opt.save_space is None.
    Otherwise every field whose get_format() result differs from its
    get_bestfit() result is replaced in tbl by an astype() conversion,
    each conversion is logged, and tbl itself is returned.
    '''
    if opt.save_space is None:
      return None

    for name,column in tbl.items():
      current = get_format(column)
      target = get_bestfit(column)
      changed = current != target
      if not changed:
        # already stored in its best-fit format
        continue
      log.info('converting field %s from type %r to %r',name,current,target)
      tbl[name] = column.astype(target)

    return tbl
Example #2
0
  def handle(self,tbl,opt):
    '''Shrink the fields of tbl to their best-fit types if requested.

    When opt.save_space is not None, each field whose current format
    (get_format) is not the same as its best-fit format (get_bestfit)
    is converted with astype() and stored back under the same key; the
    conversion is logged and tbl is returned.  When opt.save_space is
    None nothing is done and None is returned.
    '''
    if opt.save_space is not None:
      for field,data in tbl.items():
        before,after = get_format(data),get_bestfit(data)
        if before != after:
          log.info('converting field %s from type %r to %r',field,before,after)
          tbl[field] = data.astype(after)
      return tbl

    return None
Example #3
0
 def diffkeys_explore(self, key1, key2, dump_sample=0):
     '''Compare two fields and describe how they differ.

     Returns a list of observation strings.  The list is empty when no
     element of field key1 compares unequal to field key2.  If
     dump_sample is nonzero, the lines of a printed table holding at
     most dump_sample of the differing rows are appended as well.
     '''
     report = []
     vals = self.fields(key1, key2)
     mismatch = vals[0] != vals[1]
     ndiff = mismatch.sum()
     if ndiff == 0:
         return report

     tcodes = [get_format(column) for column in vals]
     tchars = [code[0] for code in tcodes]
     alpha_count = tchars.count('a')

     # non-alpha fields may agree once floating point fuzz is allowed
     # for; that is noted, but the remaining observations still follow
     if not alpha_count and na.allclose(*vals):
         report.append('no differences using allclose()')

     nrows = len(self)
     summary = (ndiff, 100.0 * ndiff / nrows, nrows)
     report.append('%s values differ (%1.2f%% of %s)' % summary)
     if tcodes[0] != tcodes[1]:
         report.append('field types differ %s' % str(tuple(tcodes)))

     # no further detail is attempted when either field is alpha type
     if alpha_count:
         return report

     # would the fields agree if both were cast to another type?
     for typestr in ('Int', 'Bool'):
         cast = [column.astype(typestr) for column in vals]
         if na.allclose(*cast):
             report.append('field values match as type(%r)' % typestr)

     # narrow both fields down to the differing entries only
     dvals = [any_compress(mismatch, column) for column in vals]
     nzmask = [(dval != 0) for dval in dvals]
     zeros_differ = (nzmask[0] != nzmask[1]).sum()
     if zeros_differ == 0:
         # the nonzero masks agree, so compare just the nonzero values
         nzvals = [any_compress(nzmask[0], dval) for dval in dvals]
         numerator = nzvals[1].astype('Float')
         denominator = nzvals[0].astype('Float')
         ratio = numerator / denominator
         factor = ratio.mean()
         if na.allclose(ratio, factor):
             report.append('field values differ by constant factor: %f' %
                           factor)

     delta = dvals[0] - dvals[1]
     dmin = delta.min()
     dmax = delta.max()
     report.append('difference mean=%f range=%f (%s to %s)' %
                   (delta.mean(), dmax - dmin, dmin, dmax))

     if not dump_sample:
         return report

     # build a small table of the differing rows and append its printout
     sample = UniTable()
     diff_rows = na.nonzero(mismatch)[0]
     sample['_idx_'] = diff_rows
     for key in (key1, key2):
         sample[key] = self.field(key)[mismatch]
     if len(sample) > dump_sample:
         sample.resize(dump_sample)
     report.extend(str(sample).split('\n'))
     return report
Example #4
0
 def diffkeys_explore(self,key1,key2,dump_sample=0):
   '''List observations about the differences between two fields.

   The fields named key1 and key2 are compared element by element.
   An empty list is returned when nothing compares unequal; otherwise
   the list holds one string per observation, followed (when
   dump_sample is nonzero) by the printed lines of a table of at most
   dump_sample differing rows.
   '''
   out = []
   note = out.append
   pair = self.fields(key1,key2)
   differs = pair[0] != pair[1]
   howmany = differs.sum()
   if howmany == 0:
     return out

   tcodes = [get_format(fld) for fld in pair]
   leading = [tc[0] for tc in tcodes]
   any_alpha = leading.count('a')

   # closeness within floating point fuzz is only tested when no field
   # is alpha type; the observations below are reported regardless
   if not any_alpha and na.allclose(*pair):
     note('no differences using allclose()')

   size = len(self)
   percent = 100.0*howmany/size
   note('%s values differ (%1.2f%% of %s)' % (howmany,percent,size))

   if tcodes[0] != tcodes[1]:
     note('field types differ %s' % str(tuple(tcodes)))

   if any_alpha:
     # alpha type fields get no further detail
     return out

   # do the fields agree when both are viewed as another type?
   for typestr in ('Int','Bool'):
     converted = [fld.astype(typestr) for fld in pair]
     if na.allclose(*converted):
       note('field values match as type(%r)' % typestr)

   # keep only the differing entries for the detailed checks
   dvals = [any_compress(differs,fld) for fld in pair]
   nzmask = [(dv != 0) for dv in dvals]
   if (nzmask[0] != nzmask[1]).sum() == 0:
     # nonzero masks agree: test the nonzero values for a constant ratio
     nzvals = [any_compress(nzmask[0],dv) for dv in dvals]
     top = nzvals[1].astype('Float')
     bottom = nzvals[0].astype('Float')
     ratio = top / bottom
     factor = ratio.mean()
     if na.allclose(ratio,factor):
       note('field values differ by constant factor: %f' % factor)

   delta = dvals[0] - dvals[1]
   dmin,dmax = delta.min(),delta.max()
   spread = dmax-dmin
   note('difference mean=%f range=%f (%s to %s)' % (
       delta.mean(),spread,dmin,dmax))

   if dump_sample:
     # table of the differing rows, cut down to dump_sample rows
     tmp = UniTable()
     where = na.nonzero(differs)[0]
     tmp['_idx_'] = where
     for key in (key1,key2):
       tmp[key] = self.field(key)[differs]
     if len(tmp) > dump_sample:
       tmp.resize(dump_sample)
     out.extend(str(tmp).split('\n'))
   return out
Example #5
0
  def _iter_pptbl(self,sep='|',xsep='+',xfill='-',method=str,text='right',rhead=0):
    sizes = []
    values = [export_string(value) for value in self.values()]
    for (name,col) in zip(self.keys(),values):
      # this is a lookahead on entire dataset to find max
      # print size for each field - try to short circuit where possible
      namesize = len(method(name))
      fldsize = None
      try:
        if is_char_array(col):
          fldsize = col.maxLen()
        elif col is None:
          fldsize = len(method(None))
        else:
          natype = get_format(col)
          is_float = natype.startswith('Float')
          if not is_float:
            nasize = col.itemsize()
            if nasize*3 <= namesize:
              fldsize = namesize
            else:
              minmax = min(col),max(col)
              fldsize = max(len(prtfld) for prtfld in it.imap(method,minmax))
      except:
        pass

      if fldsize is None:
        # no shortcut found, convert entire column
        try:
          fldsize = max(len(prtfld) for prtfld in it.imap(method,col))
        except:
          fldsize = 0
      sizes.append(max(namesize,fldsize))

    xbar = xsep + xsep.join([w*xfill for w in sizes]) + xsep
    out = [name.center(w) for (w,name) in zip(sizes,self.keys())]
    headline = sep + sep.join(out) + sep

    if text == 'left':
      for i,col in enumerate(values):
        try:
          if is_char_array(col):
            sizes[i] = -sizes[i]
        except:
          pass
    formats = ['%%%ss' % s for s in sizes]

    cols = []
    for col in values:
      if col is None:
        col = [col]*len(self)
      cols.append(col)

    # finally, yield the result

    if not rhead:
      # if not repeating header, yield it first
      yield xbar
      yield headline
      yield xbar

    for rownum in range(len(self)):
      if rhead and rownum % rhead == 0:
        # repeat header at specified interval
        yield xbar
        yield headline
        yield xbar
      out = [(fmt % method(col[rownum])) for (fmt,col) in zip(formats,cols)]
      yield sep + sep.join(out) + sep
    yield xbar
Example #6
0
 def get_type_codes(self,arrs=None):
   '''Return a list with the get_format() result for each array.

   arrs -- iterable of arrays to inspect; when None, the arrays
           returned by self.values() are used instead
   '''
   if arrs is not None:
     columns = arrs
   else:
     columns = self.values()
   codes = []
   for column in columns:
     codes.append(get_format(column))
   return codes