Esempio n. 1
0
 def __init__(self, ticks):
     """Build hour / weekday / month-day distributions from timestamps.

     @arg ticks list of datetime objects or time.struct_time tuples
     """
     super(TimeModel, self).__init__()
     # 6 four-hour buckets, 7 weekdays, month days (1-31, slot 0 unused)
     self.hourdist = DataItem([(i, 0) for i in range(6)])
     self.wdaydist = DataItem([(i, 0) for i in range(7)])
     self.mdaydist = DataItem([(i, 0) for i in range(32)])
     # Convert datetimes to struct_time in a new list instead of mutating
     # the caller's argument; guard against an empty tick list.
     if ticks and isinstance(ticks[0], datetime):
         ticks = [t.timetuple() for t in ticks]
     for tick in ticks:
         # Floor division keeps the bucket key an int under both
         # Python 2 and Python 3 (plain / would give a float key in Py3).
         self.hourdist[tick.tm_hour // 4] += 1
         self.mdaydist[tick.tm_mday] += 1
         self.wdaydist[tick.tm_wday] += 1
     # L_1-normalize each distribution so values sum to 1
     self.hourdist = norm_v1(self.hourdist)
     self.wdaydist = norm_v1(self.wdaydist)
     self.mdaydist = norm_v1(self.mdaydist)
Esempio n. 2
0
def bgdist(dset):
    """Compute the background distribution of the corpus.

        @arg dset Dataset() of term vectors
        @return DataItem() of term -> tf values in the whole corpus
    """
    dist = DataItem()
    for term in dset.iterkeys():
        # total frequency of this term across every document vector
        dist[term] = sum(dset[term])
    return dist
Esempio n. 3
0
def idf(dset):
    """Compute the IDF distribution over all terms.

        @arg dset Dataset() of term vectors
        @return DataItem() of term -> IDF values
    """
    idfdist = DataItem()
    for term in dset.iterkeys():
        # document frequency: number of vectors with a non-zero entry
        idfdist[term] = count_non_zero(dset[term])
    return idfdist
Esempio n. 4
0
def token_freq(token_lst):
    """Count occurrences of each token and return the distribution."""
    dist = DataItem()
    for tok in token_lst:
        # bump the count, creating the entry on first sight
        if tok in dist:
            dist[tok] += 1
        else:
            dist[tok] = 1
    return dist
Esempio n. 5
0
def norm_v1(ditem):
    """L_1-normalize the values of a vector.

        @arg ditem DataItem() vector
        @return DataItem() with values scaled to sum to 1
    """
    # Convert the total once so every division below is true division,
    # exactly as float(v) / total would be.
    total = float(sum(ditem[k] for k in ditem))
    normed = DataItem()
    for k in ditem:
        normed[k] = ditem[k] / total
    return normed
Esempio n. 6
0
def norm_v2(dset):
    """L_2-normalize the dataset column-wise (per vector position).

        @arg dset Dataset() of vectors
        @return Dataset() of vectors normalized
    """
    ndset = Dataset()
    for pos in range(dset.size()):
        # Euclidean norm of the column at this position across all keys
        norm = math.sqrt(sum(dset[k][pos] ** 2 for k in dset))
        entry = DataItem()
        for k in dset.iterkeys():
            entry[k] = dset[k][pos] / norm
        ndset.append(entry)
    return ndset
Esempio n. 7
0
def log_parse(src):
    """Parse a WEKA prediction-output file into a Dataset.

        @arg src path to the WEKA log file
        @return Dataset() of DataItem records with keys:
                ref/refN, prd/prdN (class index and label),
                err (misclassified flag), score (class scores), id
    """
    ins_lst = Dataset()
    with open(src) as fsrc:
        for line in fsrc:
            # strip decoration symbols, then split on whitespace
            line, dummy = _SYMBOL.subn(' ', line)
            col = _SPACE.split(line)
            # split "index:label" once per column instead of twice
            ref_no, ref_name = _CLSNO.split(col[2])[:2]
            prd_no, prd_name = _CLSNO.split(col[3])[:2]
            ins = DataItem()
            ins['ref'] = int(ref_no)
            ins['refN'] = ref_name
            ins['prd'] = int(prd_no)
            ins['prdN'] = prd_name
            # WEKA marks misclassified instances with '+'
            ins['err'] = col[4] == '+'
            ins['score'] = [float(col[i]) for i in range(4, len(col) - 2)]
            # instance id is the second-to-last column, parentheses removed
            ids, dummy = _PARATH.subn('', col[-2])
            ins['id'] = int(ids)
            ins_lst.append(ins)
    return ins_lst