Beispiel #1
0
def cluster(parser, k):
    """
    Group the parser's sessions into k clusters.

    Sessions accepted by config.session_filter_fn are converted to Markov
    chain models and partitioned with Pycluster.kcluster.

    Returns a (clusters, sse) tuple: clusters maps each cluster id to the
    list of sessions assigned to it, and sse is the second value returned
    by Pycluster.kcluster (presumably the within-cluster error -- confirm
    against the Pycluster documentation).
    """
    # Index number for every page (page_min_occurance=5 is passed to the parser)
    page_codes = parser.get_data_encoding(page_min_occurance=5)

    # Keep only the page-visit sequences that pass the configured filter
    sessions = []
    for candidate in parser.get_simple_sessions():
        if config.session_filter_fn(candidate):
            sessions.append(candidate)

    # Build one Markov chain per session and estimate transition probabilities.
    # Alternative representation (vector (v1,v2,v2) where v1 means page v1 was visited):
    #   session_modeling.convert_sessions_to_vector(sessions, page_codes, binary=True)
    chains = session_modeling.convert_sessions_to_markov(sessions, page_codes, bayes=False)

    # Alternative clustering:
    #   cluster_kmedoids(chains, k, string_similarity.jaccard_distance)
    assignments, sse, _ = Pycluster.kcluster(chains, k, method='a', dist='e')

    # Collect the sessions under the cluster id each one was assigned
    clusters = {}
    for member, label in zip(sessions, assignments):
        if label not in clusters:
            clusters[label] = []
        clusters[label].append(member)

    return clusters, sse
Beispiel #2
0
def deocarate_timings(patterns, log, total):
    """Accumulate timing data for every pattern over the sessions containing it.

    Patterns are ranked by descending support. Each session in
    ``log.sessions`` that passes ``config.session_filter_fn`` is tested
    against every pattern with ``contains``; each hit is passed to that
    pattern's ``Timing`` object via ``Timing.sum(pos, session)``
    (presumably to derive average time per page -- confirm in ``Timing``).

    patterns -- iterable of (pattern, support) pairs
    log      -- object whose ``sessions`` attribute exposes ``.values()``
    total    -- unused; kept so existing callers keep working

    Returns a dict (not a list) mapping a pattern's rank in the
    support-sorted order to its ``Timing`` object. Patterns that never
    match a session have no entry.
    """
    transactions = log.sessions
    ranked = sorted(patterns, key=operator.itemgetter(1), reverse=True)
    timings = {}
    for trans in transactions.values():
        if not config.session_filter_fn(trans):
            continue
        for rank, (pattern, support) in enumerate(ranked):
            pos = contains(trans, pattern)
            if pos == -1:
                # -1 is treated as "pattern not present in this session"
                continue
            # Create the Timing only on the first hit for this rank;
            # setdefault() would build a throwaway instance on every match.
            if rank not in timings:
                timings[rank] = Timing(pattern, support)
            timings[rank].sum(pos, trans)

    return timings
Beispiel #3
0
def deocarate_timings(patterns, log, total):
    """Accumulate timing data for every pattern over the sessions containing it.

    Patterns are ranked by descending support. Each session in
    ``log.sessions`` that passes ``config.session_filter_fn`` is tested
    against every pattern with ``contains``; each hit is passed to that
    pattern's ``Timing`` object via ``Timing.sum(pos, session)``
    (presumably to derive average time per page -- confirm in ``Timing``).

    patterns -- iterable of (pattern, support) pairs
    log      -- object whose ``sessions`` attribute exposes ``.values()``
    total    -- unused; kept so existing callers keep working

    Returns a dict (not a list) mapping a pattern's rank in the
    support-sorted order to its ``Timing`` object. Patterns that never
    match a session have no entry.
    """
    transactions = log.sessions
    ranked = sorted(patterns, key=operator.itemgetter(1), reverse=True)
    timings = {}
    for trans in transactions.values():
        if not config.session_filter_fn(trans):
            continue
        for rank, (pattern, support) in enumerate(ranked):
            pos = contains(trans, pattern)
            if pos == -1:
                # -1 is treated as "pattern not present in this session"
                continue
            # Create the Timing only on the first hit for this rank;
            # setdefault() would build a throwaway instance on every match.
            if rank not in timings:
                timings[rank] = Timing(pattern, support)
            timings[rank].sum(pos, trans)

    return timings