def receive_unitable(self):
    """Read the next UniTable from ``self.handle``.

    The reader and its options are chosen from the instance configuration:

    * ``self.types``, ``self.sep`` and ``self.header`` (each only when not
      None) are forwarded to the reader as the keyword arguments ``types``,
      ``insep`` and ``header``.
    * ``self.framing`` other than ``'EOF'`` selects buffer-framed reads
      (``bufferFramed=True`` with ``chunksize=self.chunksize``); ``'EOF'``
      selects file-framed reads.
    * With no header and no separator configured, a non-None
      ``self.ffConvert`` selects the fixed-width reader, fed with
      ``self.ffConvert.fields``.

    Side effects: after a buffer-framed read made while ``self.header`` is
    None, ``self.header`` is set from the keys of the table just read (and
    ``self.sep`` from the detected CSV dialect if it was unset), so that
    subsequent calls take the "header known" path.

    Returns:
        The UniTable that was read, or None if reading raised an Exception.
    """
    # Collect the optional reader arguments exactly once.  They must NOT be
    # repeated as explicit keywords next to **csvargs: a keyword supplied
    # both ways raises TypeError, which the handler below would silently
    # turn into a None result.
    csvargs = {}
    if self.types is not None:
        csvargs['types'] = self.types
    if self.sep is not None:
        csvargs['insep'] = self.sep
    if self.header is not None:
        csvargs['header'] = self.header

    framed = self.framing != 'EOF'

    try:
        if self.header is not None:
            # A header exists, so a prior read has been made.
            if framed:
                return UniTable().from_csv_file(
                    self.handle,
                    bufferFramed=True,
                    chunksize=self.chunksize,
                    **csvargs)
            return UniTable().from_csv_file(
                self.handle, bufferFramed=False, **csvargs)

        if self.sep is not None:
            # Separator known, header not yet known.
            table = UniTable()
            table.fromfile(
                self.handle,
                bufferFramed=framed,
                chunksize=self.chunksize,
                **csvargs)
            if framed:
                # Remember the header for the following framed reads.
                self.header = self.sep.join(table.keys())
            return table

        if self.ffConvert is not None:
            # Fixed-width input described by the converter's field list.
            return UniTable().from_fixed_width_file(
                self.handle, self.ffConvert.fields)

        if csvargs:
            # Only 'types' can be present here (no header, no separator).
            return UniTable().from_csv_file(self.handle, **csvargs)

        # No special treatment needed: let UniTable detect the format.
        table = UniTable()
        table.fromfile(
            self.handle, bufferFramed=framed, chunksize=self.chunksize)
        if framed:
            # Record the detected separator and header for later reads.
            dialect = table.get_csv_dialect()
            self.sep = dialect.delimiter
            self.header = self.sep.join(table.keys())
        return table
    except Exception:
        # Deliberate best-effort contract: any read failure yields None.
        return None
def top_ten(filenames):
    """Print the ten most frequent values of every field in *filenames*.

    Each file is loaded into a UniTable.  When a table has a ``_count_``
    column it is used as the per-row weight and is not itself reported;
    otherwise every row counts once.  Counts are accumulated per field over
    all files.

    For each field a ``Field: <name>`` line is printed, followed by up to
    ten lines (highest count first, ties ordered by value, descending), each
    giving the value's share of the total record count as a percentage and
    the ``repr`` of the value.  A field for which no values were seen
    produces only its ``Field:`` line.

    Args:
        filenames: iterable of file names readable by ``UniTable.fromfile``.

    Returns:
        None; the report is written to standard output.
    """
    import heapq

    seen_fields = {}  # field name -> {value: accumulated count}
    total_recs = 0

    # Read each file in turn.
    for filename in filenames:
        tbl = UniTable()
        tbl.fromfile(filename)
        # Work on a copy so removing '_count_' cannot disturb the table.
        keys = list(tbl.keys())
        if '_count_' in keys:
            total_recs += tbl['_count_'].sum()
            keys.remove('_count_')
        else:
            total_recs += len(tbl)
            tbl['_count_'] = 1
        counts = tbl['_count_']

        # Accumulate the weighted count of every value, column by column.
        for key in keys:
            seen_values = seen_fields.setdefault(key, {})
            for cnt, value in izip(counts, tbl[key]):
                seen_values[value] = seen_values.get(value, 0) + cnt

    # Report results.
    total = float(total_recs)
    for key, seen_values in seen_fields.items():
        # nlargest returns [] for a field with no values, where indexing a
        # sorted list of counts for a cutoff would raise IndexError.
        top = heapq.nlargest(
            10, ((cnt, value) for (value, cnt) in seen_values.items()))
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Field: %s' % (key,))
        for cnt, value in top:
            percent = 100.0 * cnt / total
            print('\t(%8.5f%%) %r' % (percent, value))
def top_ten(filenames):
    """Print the ten most frequent values of every field in *filenames*.

    Each file is loaded into a UniTable.  When a table has a ``_count_``
    column it is used as the per-row weight and is not itself reported;
    otherwise every row counts once.  Counts are accumulated per field over
    all files.

    For each field a ``Field: <name>`` line is printed, followed by up to
    ten lines (highest count first, ties ordered by value, descending), each
    giving the value's share of the total record count as a percentage and
    the ``repr`` of the value.  A field for which no values were seen
    produces only its ``Field:`` line.

    Args:
        filenames: iterable of file names readable by ``UniTable.fromfile``.

    Returns:
        None; the report is written to standard output.
    """
    import heapq

    seen_fields = {}  # field name -> {value: accumulated count}
    total_recs = 0

    # Read each file in turn.
    for filename in filenames:
        tbl = UniTable()
        tbl.fromfile(filename)
        # Work on a copy so removing '_count_' cannot disturb the table.
        keys = list(tbl.keys())
        if '_count_' in keys:
            total_recs += tbl['_count_'].sum()
            keys.remove('_count_')
        else:
            total_recs += len(tbl)
            tbl['_count_'] = 1
        counts = tbl['_count_']

        # Accumulate the weighted count of every value, column by column.
        for key in keys:
            seen_values = seen_fields.setdefault(key, {})
            for cnt, value in izip(counts, tbl[key]):
                seen_values[value] = seen_values.get(value, 0) + cnt

    # Report results.
    total = float(total_recs)
    for key, seen_values in seen_fields.items():
        # nlargest returns [] for a field with no values, where indexing a
        # sorted list of counts for a cutoff would raise IndexError.
        top = heapq.nlargest(
            10, ((cnt, value) for (value, cnt) in seen_values.items()))
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Field: %s' % (key,))
        for cnt, value in top:
            percent = 100.0 * cnt / total
            print('\t(%8.5f%%) %r' % (percent, value))