Esempio n. 1
0

# store the total time needed to compute distance matrix
matrixCompTotal = 0
# store the total time needed to compute modularity
modularityTotal = 0
# store the total time needed to split clusters
splitTotal = 0
# store the total time needed to compute cluster diameter
diaTotal = 0
# store the total time needed to find out feature exclusion
excluTotal = 0

if __name__ == '__main__':

    ngramPath = sys.argv[1]
    outPath = sys.argv[2]
    sizeThreshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.05

    startTime = time.time()
    sid_seq = rhc.getSidNgramMap(ngramPath)
    print('[LOG]: total users %d' % len(sid_seq))
    result = run(ngramPath, sid_seq, outPath)

    json.dump(result, open('%sresult.json' % outPath, 'w'))

    print('[STAT]: total time %f' % (time.time() - startTime))
    print(('[STAT]: maxtrix com: %f, dismeter: %f, modularity: %f, split: %f, '
           'exclusion: %f') %
          (matrixCompTotal, diaTotal, modularityTotal, splitTotal, excluTotal))
"""

import json
import sys
import numpy as np
import recursiveHierarchicalClustering as rhc

# Positional command-line arguments (all three are required; a missing one
# raises IndexError at import time).
inPath = sys.argv[1]     # JSON file that is parsed into `data` below
sid_ngram = sys.argv[2]  # path handed to rhc.getSidNgramMap
outPath = sys.argv[3]    # presumably the output location -- TODO confirm

# display 24 bins in visualization
binCount = 24

# Read the JSON input through a context manager so the file handle is closed
# as soon as parsing finishes instead of being left to the garbage collector.
with open(inPath) as inFile:
    data = json.load(inFile)
sid_seq = rhc.getSidNgramMap(sid_ngram)


def allUser(tree):
    """
    output all users in the tree

    A node is an indexable whose first element is a tag and whose second
    element is a list: for a leaf (tag 'l') the list holds the users, for
    any other tag it holds the child nodes.

    The users are collected depth-first, left to right.  The result is
    always a freshly built list, so callers may modify it without touching
    the lists stored inside the tree.  The walk is iterative, so deeply
    nested trees do not run into the interpreter's recursion limit.
    """
    users = []
    # explicit stack of nodes that still have to be visited
    pending = [tree]
    while pending:
        node = pending.pop()
        if node[0] == 'l':
            users.extend(node[1])
        else:
            # push children reversed so the first child is popped first,
            # which keeps the original left-to-right ordering
            pending.extend(reversed(node[1]))
    return users


def getPatternDist(pattern, sids, sid_seq):
    """
    get pattern distribution