# Example 1
def prepare_sessions(data_dir):
    """Bucket each user's time-ordered operations into sessions and
    collect the number of actions per session, once for every configured
    session length.

    :param data_dir: directory holding one ``*.csv`` operations file per user
    :return: dict mapping session_length -> flat list of per-session
             action counts aggregated across all users
    """
    actions_per_sessions = {length: [] for length in session_lengths}

    session_files = glob.glob(data_dir + "/*.csv")
    for file_no, user_session_file in enumerate(session_files, start=1):
        print("preparing %d / %d" % (file_no, len(session_files)))

        with open(user_session_file, 'rt') as sf:
            ops = []
            for raw_line in sf:
                op = Operation()
                op.init(raw_line.strip())
                ops.append(op)
            # Sessionization below assumes chronological order.
            ops.sort(key=operator.attrgetter('ts'))

            for length in session_lengths:
                # Walk the ordered ops; a gap larger than `length` between
                # the previous op's end and the next op's start closes the
                # current session and opens a new one.
                counts = []
                current = 0
                last_end = 0
                for op in ops:
                    if current == 0:
                        # very first operation of this file
                        current = 1
                    elif op.ts - last_end <= length:
                        # still inside the running session
                        current += 1
                    else:
                        # gap too large: close session, start the next
                        counts.append(current)
                        current = 1
                    last_end = op.ts + op.execution_time

                # Flush the trailing (still-open) session, if any ops seen.
                if current > 0:
                    counts.append(current)

                actions_per_sessions[length].extend(counts)

    for length in sorted(actions_per_sessions.keys()):
        print("%d - count: %s" %
              (length, statistics.get_mean_string(actions_per_sessions[length])))

    return actions_per_sessions
# Example 2
def analyze_user_session(user_session_file, out_pipeline, target_file_name):
    """Split one user's operation log into sessions and push every
    finished session through *out_pipeline*.

    :param user_session_file: path to a ``<user_id>.user_session.csv`` file
    :param out_pipeline: sink exposing ``write_to(target, payload)``
    :param target_file_name: destination name handed to the pipeline
    """
    with open(user_session_file, 'r') as sf:
        ops = []
        atimes = []
        for raw in sf:
            op = Operation()
            op.init(raw.strip())
            ops.append(op)
            atimes.append(op.ts)

        ops.sort(key=operator.attrgetter('ts'))
        atimes.sort()
        # The session gap threshold is derived from this user's own
        # access-time distribution.
        window_seconds = find_clusters(atimes)

        # User id is the filename with the fixed suffix stripped.
        uf = os.path.basename(user_session_file)
        user_id = uf[:uf.find(".user_session.csv")]

        session_counter = 1
        session = UserSession(user_id)
        session.window_seconds = window_seconds

        for op in ops:
            if session.from_ts == 0:
                # First op of a fresh session initializes its time bounds.
                session.from_ts = op.ts
                session.till_ts = op.ts + op.execution_time

            if op.ts > (session.till_ts + window_seconds):
                # Idle gap exceeded the window: archive the current
                # session and begin a new one for the same user.
                out_pipeline.write_to(target_file_name, session.finish())
                del session
                session = UserSession(user_id)
                session.window_seconds = window_seconds
                session_counter += 1

            session.add_op(op)

        # Flush the trailing (still-open) session, if it saw any ops.
        if session.num_ops > 0:
            out_pipeline.write_to(target_file_name, session.finish())

        print("sessions: %d with window_seconds: %d" %
              (session_counter, window_seconds))