Esempio n. 1
0
def get_sessions_with_ordered(
    events: pd.DataFrame,
    sessUnordered: set,
    funnel: list,
    colName: str,
    strict: bool = True,
) -> list:
    """
    Return the sessions whose events contain the funnel in the given order.

    The caller supplies the candidate sessions (those containing the funnel's
    URLs in any order). Each candidate is checked with get_sublist_indices and
    kept when that helper reports at least one match. With strict=True the
    funnel order must be followed exactly, with no additional URLs in between;
    the non-strict alternative is currently not implemented.

    :param events: events DataFrame
    :param sessUnordered: set of candidate sessions holding the URLs of interest in any order
    :param funnel: funnel list
    :param colName: column whose values are compared against the funnel
    :param strict: forwarded to get_sublist_indices; if True, enforce the funnel order strictly
    :return: list of sessions containing the funnel
    """
    candidates = utils.filter_events(events, session=list(sessUnordered))
    return [
        sid
        for sid in utils.get_sessions(candidates)
        if len(
            get_sublist_indices(
                funnel, candidates.loc[sid][colName].tolist(), strict
            )
        ) > 0
    ]
Esempio n. 2
0
def build_clicktype_index(df: pd.DataFrame) -> dict:
    """
    Build an index from click type ("rage", "dead", "error") to a session list.

    Only rows whose "EventType" equals "click" are considered. For each click
    type, sessions are collected from the click rows that have a non-null value
    in the matching modifier column. A modifier column that is absent from the
    frame yields an empty list for that click type.

    :param df: events DataFrame with an "EventType" column
    :return: defaultdict(list) with the keys "rage", "dead" and "error"
    """
    # Click-type label -> column that flags it, in the order the keys are inserted.
    modifier_columns = (
        ("rage", "EventModFrustrated"),
        ("dead", "EventModDead"),
        ("error", "EventModError"),
    )
    clicks = df.loc[df["EventType"] == "click"]
    index = defaultdict(list)
    for label, column in modifier_columns:
        if column not in clicks.columns:
            index[label] = []
            continue
        flagged = clicks.dropna(subset=[column])
        index[label] = list(utils.get_sessions(flagged))
    return index
def get_funnel_lists(events, sessIndex, funurl, funlen, columnToUse):
    """
    Count the funnels found across the sessions indexed under ``funurl``.

    The sessions listed in ``sessIndex[funurl]`` are selected from ``events``;
    for each of them get_funnels_for_session is called on the session's
    ``columnToUse`` values, and every funnel it returns increments that
    funnel's counter by one.

    :param events: events DataFrame
    :param sessIndex: mapping whose ``funurl`` entry holds the sessions to scan
    :param funurl: key looked up in ``sessIndex``; also passed to get_funnels_for_session
    :param funlen: passed through to get_funnels_for_session (presumably the
        funnel length; confirm against that helper)
    :param columnToUse: column whose values are handed to get_funnels_for_session
    :return: defaultdict(int) mapping each funnel to its number of occurrences
    """
    matching = utils.filter_events(events, session=list(sessIndex[funurl]))
    tally = defaultdict(int)
    for sid in utils.get_sessions(matching):
        values = matching.loc[sid][columnToUse].tolist()
        for funnel in get_funnels_for_session(values, funurl, funlen):
            tally[funnel] += 1
    return tally
Esempio n. 4
0
def add_loop_count(events: pd.DataFrame, colName: str):
    """
    Add a loop-count column to the events DataFrame, modifying it in place.

    For every session, the values of ``colName`` are passed to number_of_loops
    and the result is written to the NUMBEROFLOOPS column of that session's
    rows. The loop count algorithm is currently extremely naive (it counts
    repeating URLs) and may be improved later.

    :param events: pd.DataFrame with events data. It's a MultiIndex with sid as the key.
    :param colName: name of the column containing the URLs. It is a parameter so
        that 'loops' can be defined on either the original or the cleaned-up URLs.
    :return: None; ``events`` is updated in place
    """
    for sid in utils.get_sessions(events):
        session_urls = events.loc[sid][colName].tolist()
        events.loc[sid, NUMBEROFLOOPS] = number_of_loops(session_urls)
Esempio n. 5
0
def build_session_index(events: pd.DataFrame, colName: str) -> dict:
    """
    Build an inverted index from each value of ``colName`` to its sessions.

    Every session is visited once; each distinct value found in that session's
    ``colName`` column gets the session id added to its set.

    :param events: events DataFrame
    :param colName: column name to use for building the index
    :return: defaultdict(set) mapping each value (e.g. a URL) to a set of SIDs
    """
    session_ids = utils.get_sessions(events)
    index = defaultdict(set)
    for sid in session_ids:
        # Deduplicate within the session so each value is handled once per sid.
        distinct_values = set(events.loc[sid][colName].tolist())
        for value in distinct_values:
            index[value].add(sid)
    return index
Esempio n. 6
0
def get_funnel_in_outs(
    events: pd.DataFrame,
    sessionIndex: dict,
    funnel: list,
    colName: str,
    referalColName: str,
) -> (dict, dict):
    """
    Return ingress and egress counts for a specified funnel.

    Sessions containing the funnel in strict order are located first. For every
    match position reported by get_sublist_indices in such a session:

    * the ingress is the ``colName`` value just before the match; when the match
      starts the session, the ``referalColName`` value of the first row is used
      instead, or UNKNOWN if that value is not a string;
    * the egress is the ``colName`` value just after the match, or UNKNOWN when
      the match ends the session.

    :param events: events DataFrame
    :param sessionIndex: inverted index of URLs to SIDs
    :param funnel: funnel list
    :param colName: column name to use
    :param referalColName: referral column name
    :return: tuple ``(ingressCounts, egressCounts)`` of defaultdict(int) objects
    """
    sessFound = get_unordered_sessions_for_funnel(sessionIndex, funnel)
    sessOrdered = get_sessions_with_ordered(events,
                                            sessFound,
                                            funnel,
                                            colName,
                                            strict=True)
    funnelMatches = utils.filter_events(events, session=sessOrdered)

    ingressCounts = defaultdict(int)
    egressCounts = defaultdict(int)
    funnel_len = len(funnel)

    for sid in utils.get_sessions(funnelMatches):
        sess_df = funnelMatches.loc[sid]
        # Materialise the column once per session; it serves both the matcher
        # and the neighbour lookups below.
        urls = sess_df[colName].tolist()
        for start in get_sublist_indices(funnel, urls, strict=True):
            if start == 0:
                # Nothing precedes the funnel in this session: use the referrer.
                ingress = sess_df.iloc[start].loc[referalColName]
                # isinstance (rather than an exact type comparison) also accepts
                # str subclasses such as numpy.str_; missing referrers (NaN or
                # None) are not strings and are reported as UNKNOWN.
                if not isinstance(ingress, str):
                    ingress = UNKNOWN
            else:
                ingress = urls[start - 1]

            end = start + funnel_len
            # A funnel that ends the session has no following value.
            egress = UNKNOWN if end == len(urls) else urls[end]

            ingressCounts[ingress] += 1
            egressCounts[egress] += 1
    return ingressCounts, egressCounts