def __init__(self, ticks):
    """Build normalized hour / weekday / month-day histograms from ticks.

    Every tick increments one slot in each of three counters, and each
    counter is then passed through norm_v1().

    @arg ticks non-empty sequence of datetime objects, or of objects
               exposing tm_hour, tm_mday and tm_wday (e.g. time.struct_time);
               the type of the first element decides whether the sequence
               is converted with timetuple()
    """
    super(TimeModel, self).__init__()
    # 6 buckets of 4 hours each, 7 weekdays, and month days used as direct
    # indices (slot 0 exists but a valid tm_mday never lands in it).
    self.hourdist = DataItem([(i, 0) for i in range(6)])
    self.wdaydist = DataItem([(i, 0) for i in range(7)])
    self.mdaydist = DataItem([(i, 0) for i in range(32)])
    # Convert into a fresh list so the caller's sequence is left untouched.
    if isinstance(ticks[0], datetime):
        ticks = [tick.timetuple() for tick in ticks]
    for tick in ticks:
        # Floor division keeps the bucket an integer key even when true
        # division is in effect.
        self.hourdist[tick.tm_hour // 4] += 1
        self.mdaydist[tick.tm_mday] += 1
        self.wdaydist[tick.tm_wday] += 1
    self.hourdist = norm_v1(self.hourdist)
    self.wdaydist = norm_v1(self.wdaydist)
    self.mdaydist = norm_v1(self.mdaydist)
def bgdist(dset):
    """get the background distribution

    @arg dset Dataset() of term vectors
    @return DataItem() of term -> tf values in the whole corpus
            (the sum of that term's vector)
    """
    dist = DataItem()
    # Iterate the container directly instead of calling the Python-2-only
    # iterkeys(); the normalization helpers in this file iterate the same
    # way.
    for key in dset:
        dist[key] = sum(dset[key])
    return dist
def idf(dset):
    """get the per-term count used as the basis for IDF

    Despite the name, no inversion or logarithm is applied here: each term
    is mapped to whatever count_non_zero() returns for its vector.

    @arg dset Dataset() of term vectors
    @return DataItem() of term -> count_non_zero() of the term's vector
            (presumably the number of documents containing the term;
            verify against count_non_zero)
    """
    idfdist = DataItem()
    # Iterate the container directly instead of calling the Python-2-only
    # iterkeys(); the normalization helpers in this file iterate the same
    # way.
    for key in dset:
        idfdist[key] = count_non_zero(dset[key])
    return idfdist
def token_freq(token_lst):
    """Count how often each token occurs.

    @arg token_lst iterable of tokens
    @return DataItem() of token -> number of occurrences in token_lst
    """
    counts = DataItem()
    for token in token_lst:
        if token in counts:
            counts[token] += 1
        else:
            # First sighting of this token.
            counts[token] = 1
    return counts
def norm_v1(ditem):
    """normalize the values of a vector so that they sum to 1 (L_1)

    The divisor is the plain sum of the values (no absolute values are
    taken), so the result is a proper L_1 normalization for non-negative
    input such as counts.

    @arg ditem DataItem() of vector
    @return DataItem() with the same keys and float values; when the
            values sum to zero every key maps to 0.0 instead of raising
            ZeroDivisionError
    """
    nditem = DataItem()
    total = sum(ditem[key] for key in ditem)
    for key in ditem:
        # A zero total cannot be scaled to 1; report an all-zero vector.
        nditem[key] = float(ditem[key]) / total if total else 0.0
    return nditem
def norm_v2(dset):
    """normalize values vector wise, row normalization (L_2)

    Row idx is made of dset[key][idx] for every key; each row is divided
    by its Euclidean norm.

    @arg dset Dataset() of vectors
    @return Dataset() of vectors normalized; a row whose norm is zero
            (i.e. all entries are zero) is emitted as all 0.0 instead of
            raising ZeroDivisionError
    """
    ndset = Dataset()
    # dset.size() is presumably the number of rows -- it bounds the row
    # index used below.
    for idx in range(dset.size()):
        norm = math.sqrt(sum(dset[key][idx] ** 2 for key in dset))
        item = DataItem()
        # Iterate the container directly (same as the sum above) rather
        # than through the Python-2-only iterkeys().
        for key in dset:
            item[key] = dset[key][idx] / norm if norm else 0.0
        ndset.append(item)
    return ndset
def log_parse(src):
    """Parse prediction output from WEKA into one record per line.

    Each line has _SYMBOL matches replaced by a space and is then split on
    _SPACE. The resulting columns fill a DataItem() with the keys 'ref',
    'refN', 'prd', 'prdN', 'err', 'score' and 'id'.

    @arg src path of the prediction output file
    @return Dataset() holding one DataItem() per input line
    """
    records = Dataset()
    with open(src) as fsrc:
        for raw in fsrc:
            cleaned = _SYMBOL.subn(' ', raw)[0]
            col = _SPACE.split(cleaned)
            rec = DataItem()
            # Columns 2 and 3 are each split by _CLSNO into a numeric part
            # and a name part (presumably actual and predicted class --
            # confirm against the WEKA output being parsed).
            ref_parts = _CLSNO.split(col[2])
            rec['ref'] = int(ref_parts[0])
            rec['refN'] = ref_parts[1]
            prd_parts = _CLSNO.split(col[3])
            rec['prd'] = int(prd_parts[0])
            rec['prdN'] = prd_parts[1]
            rec['err'] = col[4] == '+'
            # NOTE(review): column 4 is both the error flag above and the
            # first field converted to float here -- confirm against
            # _SYMBOL and the expected file layout.
            rec['score'] = [float(field) for field in col[4:-2]]
            # The second-to-last column carries the id once _PARATH
            # matches are stripped from it.
            rec['id'] = int(_PARATH.subn('', col[-2])[0])
            records.append(rec)
    return records