def cluster(parser, k):
    """Cluster user sessions from *parser* into *k* groups.

    Returns a tuple ``(clusters, sse)`` where ``clusters`` maps a cluster id
    to the list of sessions assigned to it and ``sse`` is the within-cluster
    sum of squared errors reported by the clustering routine.
    """
    # Integer code for every page; pages seen fewer than 5 times are dropped.
    encoding = parser.get_data_encoding(page_min_occurance=5)

    # Only the page-visit sequences that pass the configured session filter.
    sessions = [s for s in parser.get_simple_sessions()
                if config.session_filter_fn(s)]

    # Represent each session as a Markov chain with estimated transition
    # probabilities.  (A binary visit-vector representation is the commented
    # alternative below.)
    # models = session_modeling.convert_sessions_to_vector(sessions, encoding, binary=True)
    models = session_modeling.convert_sessions_to_markov(sessions, encoding, bayes=False)

    # k-means-style clustering: method='a' (arithmetic mean), dist='e'
    # (Euclidean).  A k-medoids variant is the commented alternative.
    labels, sse, _ = Pycluster.kcluster(models, k, method='a', dist='e')
    # labels, sse, _ = cluster_kmedoids(models, k, string_similarity.jaccard_distance)

    # Group sessions by the cluster id they were assigned.
    clusters = {}
    for session, cluster_id in zip(sessions, labels):
        clusters.setdefault(cluster_id, []).append(session)
    return clusters, sse
def deocarate_timings(patterns, log, total):
    """Collect per-pattern timing statistics from the sessions in *log*.

    NOTE(review): this definition appeared twice in the original source (the
    second copy shadowed the first); the duplicate has been removed.  The
    misspelled name ("deocarate" for "decorate") is kept because callers may
    bind to it.

    Parameters
    ----------
    patterns : iterable of (pattern, support) pairs.
    log : object whose ``sessions`` attribute maps ids to transactions.
    total : unused here — presumably a normalization count; TODO confirm
        against callers before removing.

    Returns
    -------
    dict mapping the pattern's rank (index after sorting by support,
    descending) to a ``Timing`` accumulator fed every transaction that
    contains the pattern.
    """
    transactions = log.sessions
    # Rank patterns by support, most frequent first; the rank is the key
    # under which timings are accumulated.
    patterns = sorted(patterns, key=operator.itemgetter(1), reverse=True)
    timings = {}
    for trans in transactions.values():
        # Skip transactions rejected by the configured session filter.
        if not config.session_filter_fn(trans):
            continue
        for rank, (pattern, support) in enumerate(patterns):
            pos = contains(trans, pattern)
            if pos != -1:
                # Lazily create the Timing accumulator for this rank, then
                # fold in the time spent at the matched position.
                timings.setdefault(rank, Timing(pattern, support)).sum(pos, trans)
    return timings