def __init__(self, filename, target, inputs=None, threshold=1.0e-9):
    """Build the model from a table and score it against its own input.

    ``filename`` is either an already-loaded ``UniTable`` (used as-is) or
    something ``UniTable.fromfile`` can read.  ``target`` and ``inputs``
    are forwarded to ``model_build``; ``threshold`` is forwarded to
    ``CountsBase.__init__``.

    After the build, ``self.verify_result`` holds the original target
    column (``orig``), the model's prediction (``pred``) and their
    element-wise equality (``agree``); ``self.accuracy`` is the sum of
    ``agree`` divided by the number of rows in the data.
    """
    CountsBase.__init__(self, threshold=threshold)

    # Accept a ready-made table, otherwise load one.
    if not isinstance(filename, UniTable):
        data = UniTable().fromfile(filename)
    else:
        data = filename

    self.model_build(data, target, inputs)

    # NOTE(review): self.target is read below; presumably it is set by
    # model_build or by the base class -- confirm.
    check = UniTable()
    self.verify_result = check
    check['orig'] = data[self.target]
    check['pred'] = self.model_predict(data)
    check['agree'] = check['orig'] == check['pred']

    n_agree = float(check['agree'].sum())
    self.accuracy = n_agree / len(data)
def __str__(self):
    """Return the string form of a table with key/name/expr/rule columns.

    The columns are filled, in that order, from ``self.keys()``,
    ``self.names()``, ``self.values()`` and ``self.rules()``.
    """
    table = UniTable()
    for label, method_name in (
        ('key', 'keys'),
        ('name', 'names'),
        ('expr', 'values'),
        ('rule', 'rules'),
    ):
        # Look the accessor up only when its column is about to be built.
        table[label] = getattr(self, method_name)()
    return str(table)
def __str__(self):
    """Return the matrix as text, with a totals row and a totals column.

    The first column (``(#)``) lists ``self.rows`` followed by the label
    ``_totals_``.  Each entry of ``self.cols`` becomes a column holding
    the matching matrix column plus its column sum, and a final
    ``_totals_`` column holds the row sums plus the overall sum.
    """
    table = UniTable()

    labels = list(self.rows)
    labels.append('_totals_')
    table['(#)'] = labels

    column_totals = self.col_sums()
    for idx, name in enumerate(self.cols):
        cells = list(self.matrix[:, idx])
        cells.append(column_totals[idx])
        table[name] = cells

    last_column = list(self.row_sums())
    last_column.append(self.sum())
    table['_totals_'] = last_column

    return str(table)
def handle_select(self, opt, tbl):
    """Evaluate the ``opt.select`` expressions against ``tbl``.

    The expressions are wrapped in a ``FieldExprList``; its rules are
    applied to ``tbl`` through ``EvalTable(...).update``.  The returned
    ``UniTable`` has one column per expression, named after the
    expression's name and filled from the evaluated table under the
    expression's key.
    """
    exprs = FieldExprList(*opt.select)
    evaluated = EvalTable(exprs.rules()).update(tbl)

    selected = UniTable()
    pairs = zip(exprs.keys(), exprs.names())
    for source_key, column_name in pairs:
        selected[column_name] = evaluated[source_key]
    return selected
def export(self):
    """Return the matrix as a ``UniTable`` with row and column totals.

    Column ``(#)`` carries ``self.rownames`` followed by ``_totals_``.
    Every name in ``self.colnames`` becomes a column containing the
    corresponding matrix column and, as last entry, its column sum.  The
    trailing ``_totals_`` column contains the row sums and the grand sum.
    """
    result = UniTable()
    result['(#)'] = list(self.rownames) + ['_totals_']

    sums_by_column = self.col_sums()
    position = 0
    for name in self.colnames:
        values = list(self.matrix[:, position])
        result[name] = values + [sums_by_column[position]]
        position += 1

    result['_totals_'] = list(self.row_sums()) + [self.sum()]
    return result
def top_ten(filenames):
    """Print the ten most frequent values of every field across the files.

    Each file is loaded into a ``UniTable``.  When a file has a
    ``_count_`` column, it is used as the per-row weight and is not
    reported as a field itself; otherwise every row weighs 1.  For each
    field the (up to) ten values with the highest accumulated count are
    printed, highest first, together with their share of the total
    record count.  Ties on count are ordered by value, descending.

    A field for which no values were observed (for example a file with
    a header but no rows) prints only its ``Field:`` line; the earlier
    cutoff-based selection raised ``IndexError`` in that case.
    """
    # track values for each field
    seen_fields = {}
    total_recs = 0

    # read each file in turn
    for filename in filenames:
        tbl = UniTable()
        tbl.fromfile(filename)
        keys = list(tbl.keys())
        if '_count_' in keys:
            total_recs += tbl['_count_'].sum()
            keys.remove('_count_')
        else:
            total_recs += len(tbl)
            tbl['_count_'] = 1

        # read each column in turn
        for key in keys:
            seen_values = seen_fields.setdefault(key, {})
            # iterate over counts and values
            for cnt, value in izip(tbl['_count_'], tbl[key]):
                seen_values[value] = seen_values.get(value, 0) + cnt

    # report results
    for key, seen_values in seen_fields.items():
        # Descending sort + slice gives the same ranking as picking a
        # cutoff from the sorted counts, and is safe on an empty dict.
        ranked = sorted(
            ((cnt, value) for (value, cnt) in seen_values.items()),
            reverse=True,
        )
        top = ranked[:10]

        # Single-argument parenthesised print emits the same text under
        # the Python 2 print statement.
        print('Field: %s' % (key,))
        for (cnt, value) in top:
            percent = 100.0 * cnt / float(total_recs)
            print('\t(%8.5f%%) %r' % (percent, value))
def top_ten(filenames):
    """Report, per field, the ten values seen most often in ``filenames``.

    Every file is read into a ``UniTable``.  A ``_count_`` column, when
    present, supplies the weight of each row and is excluded from the
    report; without it each row counts once.  Counts are accumulated per
    field over all files, and for each field up to ten
    ``(count, value)`` pairs are printed in descending order along with
    the percentage of the total record count they represent.

    Fields that ended up with no values at all are reported with just
    their ``Field:`` header.  The previous implementation indexed into
    an empty list there and raised ``IndexError``.
    """
    # Function-scope import keeps this block self-contained.
    import heapq

    # track values for each field
    seen_fields = {}
    total_recs = 0

    # read each file in turn
    for filename in filenames:
        tbl = UniTable()
        tbl.fromfile(filename)
        keys = list(tbl.keys())
        if '_count_' in keys:
            total_recs += tbl['_count_'].sum()
            keys.remove('_count_')
        else:
            total_recs += len(tbl)
            tbl['_count_'] = 1

        # read each column in turn
        for key in keys:
            seen_values = seen_fields.setdefault(key, {})
            # iterate over counts and values
            for cnt, value in izip(tbl['_count_'], tbl[key]):
                seen_values[value] = seen_values.get(value, 0) + cnt

    # report results
    for key, seen_values in seen_fields.items():
        # nlargest returns the pairs already ordered from largest to
        # smallest and simply yields [] when there is nothing to rank.
        top = heapq.nlargest(
            10, ((cnt, value) for (value, cnt) in seen_values.items())
        )

        # One parenthesised argument prints identically with the
        # Python 2 print statement.
        print('Field: %s' % (key,))
        for (cnt, value) in top:
            percent = 100.0 * cnt / float(total_recs)
            print('\t(%8.5f%%) %r' % (percent, value))
def __init__(self, filename=None, keys=None):
    """Set up the value -> index mapping, optionally loading it from disk.

    Parameters:
        filename: optional path of a previously saved table.  When it is
            given and exists, its ``index`` and ``data`` columns are read
            and each ``data`` value is mapped to its ``index``.
        keys: optional list stored as ``self.keys``.  When omitted, every
            instance gets its own new empty list.

    ``self.data`` always starts with the empty string mapped to 0.
    """
    # The former ``keys=[]`` default was one list object shared by all
    # instances constructed without ``keys``; mutating it through one
    # instance leaked into the others.
    self.keys = [] if keys is None else keys
    self.data = data = {'': 0}  # try to pre-assign empty string value
    self.filename = filename

    if filename and os.path.exists(filename):
        from augustus.unitable import UniTable
        tbl = UniTable().fromfile(filename)
        for i, value in it.izip(tbl['index'], tbl['data']):
            data[value] = i
        del tbl
def __call__(self, data):
    """Score ``data`` and return the last entry of the ``score`` column.

    A fresh ``UniTable`` is stored in ``self._state`` and filled step by
    step: the input (``data``), the outputs of ``self.nullmodel`` and
    ``self.altmodel`` on it, their ratio (``odds``), ``na.log`` of that
    ratio (``log_odds``), the sequence produced by ``gen_cusum`` with
    ``self.reset_value`` (``cusum``), and finally the comparison of that
    sequence against ``self.threshold`` (``score``).
    """
    tbl = UniTable()
    self._state = tbl

    tbl['data'] = data
    # Both models are fed the column as stored in the table, not the raw
    # argument.
    for column, model in (('nullmodel', self.nullmodel),
                          ('altmodel', self.altmodel)):
        tbl[column] = model(tbl['data'])

    tbl['odds'] = tbl['altmodel'] / tbl['nullmodel']
    tbl['log_odds'] = na.log(tbl['odds'])

    running = gen_cusum(tbl['log_odds'], self.reset_value)
    tbl['cusum'] = list(running)
    tbl['score'] = tbl['cusum'] > self.threshold

    flags = tbl['score']
    return flags[-1]
def _make_tbl(self, cfunc, ccfunc):
    """Return the string form of a table with one column per target value.

    The ``__fld__`` and ``__val__`` columns hold the first and second
    item of every entry produced by ``self.iter_ikv()``, preceded by an
    empty-string header cell.  For each value from ``self.all_tval()`` a
    column named ``str(tval)`` is added: its first cell is
    ``cfunc(tval)`` and the remaining cells are
    ``ccfunc(tval, ikey, ival)`` for every ``(ikey, ival)`` entry.
    """
    table = UniTable()
    entries = list(self.iter_ikv())

    field_cells = ['']
    field_cells.extend(entry[0] for entry in entries)
    table['__fld__'] = field_cells

    value_cells = ['']
    value_cells.extend(entry[1] for entry in entries)
    table['__val__'] = value_cells

    for tval in self.all_tval():
        cells = [cfunc(tval)]
        for (ikey, ival) in entries:
            cells.append(ccfunc(tval, ikey, ival))
        table[str(tval)] = cells

    return str(table)
def flush(self):
    """Write ``self.data`` to ``self.filename`` as a two-column CSV table.

    Nothing is written when no filename is set or when the mapping holds
    at most one entry.  Otherwise the mapping's values go to the
    ``index`` column and its keys to the ``data`` column, the table is
    sorted on ``index`` and saved with ``to_csv_file``.
    """
    # Guard clause: nothing to save.
    if not self.filename or len(self.data) <= 1:
        return

    from augustus.unitable import UniTable

    table = UniTable(keys=['index', 'data'])
    pairs = self.data.items()
    table['index'] = [pair[1] for pair in pairs]
    table['data'] = [pair[0] for pair in pairs]
    del pairs

    table.sort_on('index')
    table.to_csv_file(self.filename)
    del table
def handle_arg(self, opt, arg):
    """Load ``arg`` into a table, build its count table and print it as CSV.

    When ``opt.select`` is set, the loaded table is first passed through
    ``self.handle_select``.  The result goes to
    ``self.handle_counttable``, whose ``export()`` output is printed via
    ``to_csv_str()``.
    """
    source = UniTable().fromfile(arg)
    if opt.select:
        source = self.handle_select(opt, source)
    counts = self.handle_counttable(opt, arg, source)
    csv_text = counts.export().to_csv_str()
    print(csv_text)
def handle_arg(self, opt, arg):
    """Process one input file and print the resulting count table as CSV.

    The file named by ``arg`` is read into a ``UniTable``; if
    ``opt.select`` is truthy the table is replaced by the result of
    ``self.handle_select``.  ``self.handle_counttable`` then produces the
    object whose exported table is written to standard output.
    """
    loaded = UniTable().fromfile(arg)
    selected = self.handle_select(opt, loaded) if opt.select else loaded
    count_table = self.handle_counttable(opt, arg, selected)
    exported = count_table.export()
    print(exported.to_csv_str())
def __init__(self, nullmodel, altmodel, threshold, reset_value=0.0):
    """Store the two models and the scoring parameters.

    ``nullmodel``, ``altmodel``, ``threshold`` and ``reset_value`` are
    kept on the instance under the same names, and ``self._state`` starts
    out as an empty ``UniTable``.
    """
    self.nullmodel, self.altmodel = nullmodel, altmodel
    self.threshold, self.reset_value = threshold, reset_value
    self._state = UniTable()