Example #1
def receive_unitable(self):
  _csvargs = {}
  if self.types is not None:
    _csvargs['types'] = self.types
  if self.sep is not None:
    _csvargs['insep'] = self.sep
  if self.header is not None:
    _csvargs['header'] = self.header
  try:
    if self.header is None:
      if self.sep is None:
        # Neither separator nor header is known yet.
        if len(_csvargs) == 0 and self.ffConvert is None:
          # No special treatment needed.
          u = UniTable()
          if self.framing != 'EOF':
            # New size-framed stream: remember the dialect for later reads.
            u.fromfile(self.handle, bufferFramed=True, chunksize=self.chunksize)
            d = u.get_csv_dialect()
            self.sep = d.delimiter
            self.header = self.sep.join(u.keys())
          else:
            # Traditional file-framed stream.
            u.fromfile(self.handle, bufferFramed=False, chunksize=self.chunksize)
          return u
        elif self.ffConvert is not None:
          # Fixed-width input: field widths come from the ffConvert spec.
          fields = self.ffConvert.fields
          return UniTable().from_fixed_width_file(self.handle, fields)
        else:
          return UniTable().from_csv_file(self.handle, **_csvargs)
      else:
        # Separator is known; _csvargs already carries it as 'insep'.
        u = UniTable()
        if self.framing != 'EOF':
          # New size-framed stream.
          u.fromfile(self.handle, bufferFramed=True,
                     chunksize=self.chunksize, **_csvargs)
          self.header = self.sep.join(u.keys())
        else:
          # Traditional file-framed stream.
          u.fromfile(self.handle, bufferFramed=False,
                     chunksize=self.chunksize, **_csvargs)
        return u
    else:
      # A header exists, so a prior read has already been made; _csvargs
      # already carries the header (and the separator, if one is known).
      if self.framing != 'EOF':
        return UniTable().from_csv_file(self.handle, bufferFramed=True,
                                        chunksize=self.chunksize, **_csvargs)
      else:
        return UniTable().from_csv_file(self.handle, bufferFramed=False, **_csvargs)
  except Exception:
    # Any read or parse failure is treated as "no more data".
    return None
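
A minimal usage sketch, not taken from the project: receive_unitable reads its stream handle and CSV options from attributes on self (handle, types, sep, header, framing, chunksize, ffConvert), so it needs an owner object carrying that state. The StreamReader class and the file name below are hypothetical, assembled only from what the method body uses.

class StreamReader(object):
  """Hypothetical holder for the state receive_unitable relies on."""
  def __init__(self, handle, chunksize=1000, framing='EOF'):
    self.handle = handle      # open file-like object to read from
    self.types = None         # optional column-type overrides
    self.sep = None           # field separator, learned on the first framed read
    self.header = None        # header line, learned on the first framed read
    self.framing = framing    # 'EOF' for file-framed input, anything else for size-framed
    self.chunksize = chunksize
    self.ffConvert = None     # fixed-width field spec, if any

  # Reuse the module-level function above as a method.
  receive_unitable = receive_unitable

# Read tables until the stream is exhausted; receive_unitable returns None
# once a read fails, and an empty table is treated the same way here.
reader = StreamReader(open('scores.csv', 'rb'))
while True:
  tbl = reader.receive_unitable()
  if tbl is None or len(tbl) == 0:
    break
  print len(tbl), 'rows with columns', tbl.keys()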
Example #2
from itertools import izip   # Python 2; UniTable is assumed imported from the surrounding module

def top_ten(filenames):

    # track values for each field
    seen_fields = {}
    total_recs = 0

    # read each file in turn
    for filename in filenames:
        tbl = UniTable()
        tbl.fromfile(filename)

        keys = tbl.keys()[:]
        if '_count_' in keys:
            total_recs += tbl['_count_'].sum()
            keys.remove('_count_')
        else:
            total_recs += len(tbl)
            tbl['_count_'] = 1

        # read each column in turn
        for key in keys:
            seen_values = seen_fields.setdefault(key, {})

            # iterate over counts and values
            for cnt, value in izip(tbl['_count_'], tbl[key]):
                try:
                    seen_values[value] += cnt
                except KeyError:
                    seen_values[value] = cnt

    # report results
    for key, seen_values in seen_fields.items():

        # find the ten most frequent values
        top_cnts = sorted(seen_values.values())
        cutoff = top_cnts[-10:][0]   # tenth-largest count (or smallest, if fewer than ten)
        tmp = sorted((cnt, value) for (value, cnt) in seen_values.items()
                     if cnt >= cutoff)
        top = reversed(tmp[-10:])

        # report
        print 'Field:', key
        for (cnt, value) in top:
            percent = 100.0 * cnt / float(total_recs)
            print '\t(%8.5f%%) %r' % (percent, value)
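
A minimal way to try top_ten, with made-up data: it only needs file names that UniTable.fromfile can parse, so a small comma-separated file is enough (assuming fromfile auto-detects this layout, as Example #1 suggests). The file name, columns, and the output sketched in the comment are illustrative only.

if __name__ == '__main__':
    # Write a tiny comma-separated file to summarize.
    with open('colors.csv', 'wb') as f:
        f.write('color,size\n')
        f.write('red,small\n')
        f.write('red,large\n')
        f.write('blue,small\n')

    # For each field this should print up to ten values with their share of
    # all records, roughly like:
    #   Field: color
    #           (66.66667%) 'red'
    #           (33.33333%) 'blue'
    top_ten(['colors.csv'])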