def prepare_sessions(data_dir):
    """Count the actions per session for every configured session length.

    Every ``*.csv`` file directly inside ``data_dir`` is read line by line;
    each stripped line is handed to ``Operation.init``. The operations of a
    file are ordered by their ``ts`` attribute and then split into sessions
    once for each value in the module-level ``session_lengths``.

    Progress is printed per file, and a summary line per session length is
    printed at the end.

    Args:
        data_dir: Directory that is searched for ``*.csv`` files.

    Returns:
        A dict mapping each session length to a flat list holding the number
        of actions of every session found across all files.
    """
    counts_by_length = {length: [] for length in session_lengths}

    session_files = glob.glob(data_dir + "/*.csv")
    total_files = len(session_files)

    for file_no, path in enumerate(session_files, start=1):
        print("preparing %d / %d" % (file_no, total_files))

        with open(path, 'rt') as handle:
            ops = []
            for raw_line in handle:
                parsed = Operation()
                parsed.init(raw_line.strip())
                ops.append(parsed)

            ops.sort(key=operator.attrgetter('ts'))

            for length in session_lengths:
                counts_by_length[length].extend(
                    _count_session_actions(ops, length))

    for length in sorted(counts_by_length):
        print("%d - count: %s" % (
            length, statistics.get_mean_string(counts_by_length[length])))

    return counts_by_length


def _count_session_actions(ops, session_length):
    """Return the number of actions in each session of the ordered ``ops``.

    An operation stays in the running session while the gap between its
    ``ts`` and the end (``ts + execution_time``) of the preceding operation
    is at most ``session_length``; otherwise it opens a new session.
    """
    # NOTE(review): the end-of-previous-operation marker is refreshed after
    # every operation, not only when a session starts -- confirm this matches
    # the intended nesting of the original code.
    counts = []
    in_session = 0
    previous_end = 0

    for op in ops:
        if in_session > 0 and op.ts - previous_end <= session_length:
            in_session += 1
        else:
            # The very first operation has no finished session to record.
            if in_session > 0:
                counts.append(in_session)
            in_session = 1
        previous_end = op.ts + op.execution_time

    if in_session > 0:
        counts.append(in_session)
    return counts
def analyze_user_session(user_session_file, out_pipeline, target_file_name):
    """Split one user's operations into sessions and write each one out.

    The file is read line by line; each stripped line is handed to
    ``Operation.init``. The operations are ordered by ``ts``, the sorted
    timestamps are passed to ``find_clusters`` to obtain the session window
    in seconds, and the operations are then added to ``UserSession`` objects.
    A session is closed as soon as an operation starts more than
    ``window_seconds`` after the session's ``till_ts``. Every closed session,
    and the last one if it holds any operations, is written with
    ``out_pipeline.write_to(target_file_name, session.finish())``.

    A summary line with the session count and the window is printed.

    Args:
        user_session_file: Path of the per-user file. The user id is the
            part of the base name before ``".user_session.csv"``; if the
            name lacks that suffix the whole base name is used.
        out_pipeline: Object providing ``write_to(name, payload)``.
        target_file_name: Passed unchanged as first argument to ``write_to``.
    """
    # Only parsing needs the file, so it is closed before the analysis runs.
    with open(user_session_file, 'r') as sf:
        ops = []
        for line in sf:
            op = Operation()
            op.init(line.strip())
            ops.append(op)

    ops.sort(key=operator.attrgetter('ts'))
    atimes = sorted(op.ts for op in ops)

    window_seconds = find_clusters(atimes)

    # str.find returns -1 when the suffix is missing; slicing with that would
    # silently cut off the last character of the name, so guard against it.
    file_name = os.path.basename(user_session_file)
    suffix_at = file_name.find(".user_session.csv")
    user_id = file_name[:suffix_at] if suffix_at != -1 else file_name

    session_counter = 1
    session = UserSession(user_id)
    session.window_seconds = window_seconds

    for op in ops:
        if session.from_ts == 0:
            # Seed the boundaries of a session that has no start yet.
            session.from_ts = op.ts
            session.till_ts = op.ts + op.execution_time
            # NOTE(review): presumably add_op() advances till_ts (and sets
            # from_ts for sessions created below) -- confirm in UserSession.

        if (session.till_ts + window_seconds) < op.ts:
            # The gap exceeds the window: archive this session, start anew.
            out_pipeline.write_to(target_file_name, session.finish())
            session = UserSession(user_id)
            session.window_seconds = window_seconds
            session_counter += 1

        session.add_op(op)

    # Flush the trailing session unless it never received an operation.
    if session.num_ops > 0:
        out_pipeline.write_to(target_file_name, session.finish())

    print("sessions: %d with window_seconds: %d" % (session_counter, window_seconds))